[GPU] Added shape agnostic Pad ref kernel #16160

Merged 1 commit on Mar 8, 2023
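
This change makes the GPU Pad (border) reference kernel shape agnostic: the OCL implementation now builds its kernel params with an is_shape_agnostic flag, overrides update_dispatch_data to refresh work-group sizes once the real shapes are known, registers the impl for dynamic bfyx/bfzyx/bfwzyx layouts, and threads an optional shape_info argument through the OpenCL kernel; a dynamic-shape test for constant padding is added as well. The sketch below illustrates the compile-once / re-dispatch-per-inference idea in isolation; Shape, WorkSizes and CompiledKernel are hypothetical stand-ins, not the plugin's real cldnn or kernel_selector types.

#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>

// Hypothetical stand-ins for cldnn/kernel_selector types -- illustration only.
struct Shape { int64_t b, f, y, x; };                       // 4D (bfyx) for brevity
struct WorkSizes { std::array<std::size_t, 3> gws{}, lws{}; };

struct CompiledKernel {
    // Compiled once from JIT that reads shapes from a runtime buffer,
    // so the binary itself never hard-codes tensor dimensions.
    WorkSizes work;
};

// Mirrors SetDefault(): dispatch sizes are derived from the output shape only.
WorkSizes compute_dispatch(const Shape& out) {
    WorkSizes ws;
    ws.gws = { static_cast<std::size_t>(out.x),
               static_cast<std::size_t>(out.y),
               static_cast<std::size_t>(out.b * out.f) };
    ws.lws = { 1, 1, 1 };  // a real impl picks optimal local sizes per device
    return ws;
}

// Mirrors update_dispatch_data(): called per inference once shapes are known.
void update_dispatch_data(CompiledKernel& kernel, const Shape& actual_output) {
    kernel.work = compute_dispatch(actual_output);
}

int main() {
    CompiledKernel pad_kernel;                          // built once for a dynamic layout
    update_dispatch_data(pad_kernel, {5, 6, 10, 11});   // shapes arrive at runtime
    std::cout << pad_kernel.work.gws[0] << " x " << pad_kernel.work.gws[1]
              << " x " << pad_kernel.work.gws[2] << std::endl;  // 11 x 10 x 30
    return 0;
}
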
149 changes: 48 additions & 101 deletions src/plugins/intel_gpu/src/graph/impls/ocl/border.cpp
@@ -24,9 +24,9 @@ struct border_impl : typed_primitive_impl_ocl<border> {
return make_unique<border_impl>(*this);
}

static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
const auto& primitive = impl_param.typed_desc<border>();
auto params = get_default_params<kernel_selector::border_params>(impl_param);
auto params = get_default_params<kernel_selector::border_params>(impl_param, is_shape_agnostic);
auto optional_params = get_default_optional_params<kernel_selector::border_optional_params>(impl_param.get_program());

size_t rank = impl_param.get_input_layout(0).get_rank();
@@ -36,7 +36,7 @@ struct border_impl : typed_primitive_impl_ocl<border> {
std::vector<int32_t> end(primitive->pads_end.begin(), primitive->pads_end.end());

size_t input_offset = 1;
if (!(primitive->non_constant_input_mask & border::PAD_NON_CONST_INPUT::BEGIN) && !params.has_dynamic_tensors()) {
if (!(primitive->non_constant_input_mask & border::PAD_NON_CONST_INPUT::BEGIN)) {
params.begin_type = kernel_selector::base_params::ArgType::Constant;

std::vector<int64_t> begin_vec;
@@ -55,7 +55,7 @@ struct border_impl : typed_primitive_impl_ocl<border> {
input_offset += 1;
}

if (!(primitive->non_constant_input_mask & border::PAD_NON_CONST_INPUT::END) && !params.has_dynamic_tensors()) {
if (!(primitive->non_constant_input_mask & border::PAD_NON_CONST_INPUT::END)) {
params.end_type = kernel_selector::base_params::ArgType::Constant;

std::vector<int64_t> end_vec;
@@ -102,108 +102,55 @@ struct border_impl : typed_primitive_impl_ocl<border> {

return {params, optional_params};
}

void update_dispatch_data(const kernel_impl_params& impl_param) override {
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data);
update_kernels_list_to_skip();
}
};

namespace detail {

attach_border_impl::attach_border_impl() {
implementation_map<border>::add(impl_types::ocl, typed_primitive_impl_ocl<border>::create<border_impl>, {
std::make_tuple(data_types::f32, format::yxfb),
std::make_tuple(data_types::f16, format::yxfb),
std::make_tuple(data_types::i32, format::yxfb),
std::make_tuple(data_types::i8, format::yxfb),
std::make_tuple(data_types::u8, format::yxfb),

std::make_tuple(data_types::f32, format::bfyx),
std::make_tuple(data_types::f16, format::bfyx),
std::make_tuple(data_types::i32, format::bfyx),
std::make_tuple(data_types::i8, format::bfyx),
std::make_tuple(data_types::u8, format::bfyx),

std::make_tuple(data_types::f32, format::byxf),
std::make_tuple(data_types::f16, format::byxf),
std::make_tuple(data_types::i32, format::byxf),
std::make_tuple(data_types::i8, format::byxf),
std::make_tuple(data_types::u8, format::byxf),

std::make_tuple(data_types::f32, format::bfzyx),
std::make_tuple(data_types::f16, format::bfzyx),
std::make_tuple(data_types::i32, format::bfzyx),
std::make_tuple(data_types::i8, format::bfzyx),
std::make_tuple(data_types::u8, format::bfzyx),

std::make_tuple(data_types::f32, format::bfwzyx),
std::make_tuple(data_types::f16, format::bfwzyx),
std::make_tuple(data_types::i32, format::bfwzyx),
std::make_tuple(data_types::i8, format::bfwzyx),
std::make_tuple(data_types::u8, format::bfwzyx),

std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
std::make_tuple(data_types::i32, format::b_fs_yx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),

std::make_tuple(data_types::f32, format::b_fs_yx_fsv32),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv32),
std::make_tuple(data_types::i32, format::b_fs_yx_fsv32),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),

std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::i32, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),

std::make_tuple(data_types::f32, format::bs_fs_yx_bsv4_fsv2),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv4_fsv2),
std::make_tuple(data_types::i32, format::bs_fs_yx_bsv4_fsv2),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv4_fsv2),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv4_fsv2),

std::make_tuple(data_types::f32, format::bs_fs_yx_bsv4_fsv4),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv4_fsv4),
std::make_tuple(data_types::i32, format::bs_fs_yx_bsv4_fsv4),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv4_fsv4),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv4_fsv4),

std::make_tuple(data_types::f32, format::bs_fs_yx_bsv8_fsv2),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv8_fsv2),
std::make_tuple(data_types::i32, format::bs_fs_yx_bsv8_fsv2),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv8_fsv2),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv8_fsv2),

std::make_tuple(data_types::f32, format::bs_fs_yx_bsv8_fsv4),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv8_fsv4),
std::make_tuple(data_types::i32, format::bs_fs_yx_bsv8_fsv4),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv8_fsv4),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv8_fsv4),

std::make_tuple(data_types::f32, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::i32, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv16_fsv16),

std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv16),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv16),
std::make_tuple(data_types::i32, format::bs_fs_yx_bsv32_fsv16),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv16),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv16),

std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::i32, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv32),

std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::i32, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::i8, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::u8, format::bs_fs_zyx_bsv16_fsv16),
});
auto types = {data_types::f32, data_types::f16, data_types::i32, data_types::i8, data_types::u8};

auto formats = {
format::yxfb,
format::bfyx,
format::byxf,
format::bfzyx,
format::bfwzyx,
format::b_fs_yx_fsv16,
format::b_fs_yx_fsv32,
format::b_fs_zyx_fsv16,
format::bs_fs_yx_bsv4_fsv2,
format::bs_fs_yx_bsv4_fsv4,
format::bs_fs_yx_bsv8_fsv2,
format::bs_fs_yx_bsv8_fsv4,
format::bs_fs_yx_bsv16_fsv16,
format::bs_fs_yx_bsv32_fsv16,
format::bs_fs_yx_bsv32_fsv32,
format::bs_fs_zyx_bsv16_fsv16
};

implementation_map<border>::add(impl_types::ocl,
shape_types::static_shape,
typed_primitive_impl_ocl<border>::create<border_impl>,
types,
formats);

auto dyn_formats = {
format::bfyx,
format::bfzyx,
format::bfwzyx
};

implementation_map<border>::add(impl_types::ocl,
shape_types::dynamic_shape,
typed_primitive_impl_ocl<border>::create<border_impl>,
types,
dyn_formats);
}

} // namespace detail
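
Note on the hunk above: the long per-(data type, format) tuple list was collapsed into a types x formats cross product, and a smaller dyn_formats list (bfyx, bfzyx, bfwzyx) is registered separately under shape_types::dynamic_shape. A rough, self-contained sketch of the cross-product registration idea follows; the enums and the plain std::map are hypothetical stand-ins for the real implementation_map<border>.

#include <map>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-ins -- the real code registers into implementation_map<border>.
enum class data_type { f32, f16, i32, i8, u8 };
enum class fmt { bfyx, bfzyx, bfwzyx };
using Key = std::pair<data_type, fmt>;

// One nested loop replaces dozens of hand-written std::make_tuple(...) entries.
void register_all(std::map<Key, std::string>& registry,
                  const std::vector<data_type>& types,
                  const std::vector<fmt>& formats,
                  const std::string& impl_name) {
    for (auto dt : types)
        for (auto f : formats)
            registry[{dt, f}] = impl_name;
}

int main() {
    std::map<Key, std::string> registry;
    register_all(registry,
                 {data_type::f32, data_type::f16, data_type::i32, data_type::i8, data_type::u8},
                 {fmt::bfyx, fmt::bfzyx, fmt::bfwzyx},
                 "border_impl");
    return registry.size() == 15u ? 0 : 1;  // 5 types x 3 dynamic formats
}
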
@@ -5,6 +5,7 @@
#include "include/fetch_utils.cl"

KERNEL(border_gpu_ref)(
OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* input,
#ifdef BEGIN_TYPE
const __global BEGIN_TYPE* begin,
@@ -154,7 +155,7 @@ KERNEL(border_gpu_ref)(
const uint in_f = out_f - blt_sf;
const uint in_b = out_b - blt_sb;

const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
const uint in_pos = FUNC_CALL(get_input_index)(OPTIONAL_SHAPE_INFO_TENSOR in_b, in_f, in_w, in_z, in_y, in_x);
in_val = input[in_pos];
}
#elif defined BORDER_TYPE_EDGE
@@ -165,7 +166,7 @@ KERNEL(border_gpu_ref)(
const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? 0 : in_sf - 1);
const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? 0 : in_sb - 1);

const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
const uint in_pos = FUNC_CALL(get_input_index)(OPTIONAL_SHAPE_INFO_TENSOR in_b, in_f, in_w, in_z, in_y, in_x);
INPUT0_TYPE in_val = input[in_pos];
#elif defined BORDER_TYPE_MIRROR
const uint in_x = (out_x >= blt_sx & out_x < in_lx) ? out_x - blt_sx : (out_x < blt_sx ? blt_sx - 1 - out_x : in_sx + in_lx - 1 - out_x);
@@ -175,7 +176,7 @@ KERNEL(border_gpu_ref)(
const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? blt_sf - 1 - out_f : in_sf + in_lf - 1 - out_f);
const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? blt_sb - 1 - out_b : in_sb + in_lb - 1 - out_b);

const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
const uint in_pos = FUNC_CALL(get_input_index)(OPTIONAL_SHAPE_INFO_TENSOR in_b, in_f, in_w, in_z, in_y, in_x);
INPUT0_TYPE in_val = input[in_pos];
#elif defined BORDER_TYPE_MIRROR_101
const uint in_x = (out_x >= blt_sx & out_x < in_lx) ? out_x - blt_sx : (out_x < blt_sx ? blt_sx - out_x : in_sx + in_lx - 2 - out_x);
@@ -185,12 +186,12 @@ KERNEL(border_gpu_ref)(
const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? blt_sf - out_f : in_sf + in_lf - 2 - out_f);
const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? blt_sb - out_b : in_sb + in_lb - 2 - out_b);

const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
const uint in_pos = FUNC_CALL(get_input_index)(OPTIONAL_SHAPE_INFO_TENSOR in_b, in_f, in_w, in_z, in_y, in_x);
INPUT0_TYPE in_val = input[in_pos];
#else
#error Unsupported border type.
#endif

const uint out_pos = FUNC_CALL(get_output_index)(out_b, out_f, out_w, out_z, out_y, out_x);
const uint out_pos = FUNC_CALL(get_output_index)(OPTIONAL_SHAPE_INFO_TENSOR out_b, out_f, out_w, out_z, out_y, out_x);
output[out_pos] = in_val;
}
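
The kernel-side change is limited to threading OPTIONAL_SHAPE_INFO_ARG / OPTIONAL_SHAPE_INFO_TENSOR through the entry point and the get_input_index / get_output_index helpers. Presumably these macros expand to an extra runtime shape-info buffer argument in the shape-agnostic build and to nothing in the static build, so one kernel source serves both cases; the expansion sketched below is an illustrative guess, not the actual JIT definition.

// Illustrative guess at the two build modes -- not the real JIT definitions.
#ifdef IS_DYNAMIC
#define OPTIONAL_SHAPE_INFO_ARG    const __global int* shape_info,
#define OPTIONAL_SHAPE_INFO_TENSOR shape_info,
#else
#define OPTIONAL_SHAPE_INFO_ARG
#define OPTIONAL_SHAPE_INFO_TENSOR
#endif
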
@@ -46,14 +46,16 @@ BorderKernelBase::DispatchData BorderKernelBase::SetDefault(const border_params&
const auto& output = params.outputs[0];

DispatchData dispatchData;
auto in_layout = params.inputs[0].GetLayout();
auto out_layout = params.outputs[0].GetLayout();
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Z },
{ Tensor::DataChannelName::Y, Tensor::DataChannelName::W },
{ Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};

dispatchData.gws = { output.X().v * output.Z().v, output.Y().v * output.W().v, output.Batch().v * output.Feature().v };
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
if (!params.has_dynamic_tensors()) {
auto in_layout = params.inputs[0].GetLayout();
auto out_layout = params.outputs[0].GetLayout();
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Z },
{ Tensor::DataChannelName::Y, Tensor::DataChannelName::W },
{ Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};

dispatchData.gws = { output.X().v * output.Z().v, output.Y().v * output.W().v, output.Batch().v * output.Feature().v };
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
}

return dispatchData;
}
@@ -67,16 +69,32 @@ KernelsData BorderKernelBase::GetCommonKernelsData(const Params& params,

auto dispatchData = SetDefault(prim_params);
KernelData k_data = KernelData::Default<border_params>(params);
border_params& newParams = *static_cast<border_params*>(k_data.params.get());
k_data.update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
const auto& prim_params = static_cast<const border_params&>(params);
auto dispatchData = SetDefault(prim_params);
OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
kd.kernels[0].params.workGroups.global = dispatchData.gws;
kd.kernels[0].params.workGroups.local = dispatchData.lws;
};

auto cldnn_jit = GetJitConstants(prim_params);
auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, params, options);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);

auto& kernel = k_data.kernels[0];
FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point,
"", false, false, static_cast<int>(newParams.inputs.size()),
0, 1, newParams.has_dynamic_tensors());
FillCLKernelData(kernel,
dispatchData,
params.engineInfo,
kernelName,
jit,
entry_point,
EXE_MODE_DEFAULT,
false,
false,
(uint32_t)prim_params.inputs.size(),
GetFusedPrimitiveInputsCount(params),
1,
prim_params.outputs[0].is_dynamic());

return {k_data};
}
@@ -27,6 +27,7 @@ ParamsKey BorderKernelRef::GetSupportedKey() const {
k.EnableTensorPitches();
k.EnableBatching();
k.EnableDifferentTypes();
k.EnableDynamicShapesSupport();
return k;
}

80 changes: 80 additions & 0 deletions src/plugins/intel_gpu/tests/test_cases/border_gpu_test.cpp
@@ -7,6 +7,8 @@
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/border.hpp>

#include <border_inst.h>

#include <cstddef>
#include <array>

@@ -1551,3 +1553,81 @@ TEST(border_gpu, basic_bfyx_2x1x2x3_1x2x3x4_border_edge) {
}
}
}

TEST(border_gpu, basic_bfyx_2x1x2x3_1x2x3x4_border_constant_dynamic) {
constexpr auto in_size_b = 2;
constexpr auto in_size_f = 3;
constexpr auto in_size_y = 5;
constexpr auto in_size_x = 4;

constexpr auto blt_size_b = 2;
constexpr auto blt_size_f = 1;
constexpr auto blt_size_y = 2;
constexpr auto blt_size_x = 3;

constexpr auto brb_size_b = 1;
constexpr auto brb_size_f = 2;
constexpr auto brb_size_y = 3;
constexpr auto brb_size_x = 4;

constexpr auto out_size_b = in_size_b + blt_size_b + brb_size_b;
constexpr auto out_size_f = in_size_f + blt_size_f + brb_size_f;
constexpr auto out_size_y = in_size_y + blt_size_y + brb_size_y;
constexpr auto out_size_x = in_size_x + blt_size_x + brb_size_x;

auto& engine = get_test_engine();
auto input_layout_dynamic = layout{ov::PartialShape::dynamic(4), data_types::f32, format::bfyx};
auto input_layout_static = layout{ov::PartialShape{in_size_b, in_size_f, in_size_y, in_size_x}, data_types::f32, format::bfyx};
auto input = engine.allocate_memory(input_layout_static);

topology topology;
topology.add(input_layout("input", input_layout_dynamic));
topology.add(border("border",
{input_info("input")}, 0,
ov::CoordinateDiff{blt_size_b, blt_size_f, blt_size_y, blt_size_x},
ov::CoordinateDiff{brb_size_b, brb_size_f, brb_size_y, brb_size_x},
ov::op::PadMode::CONSTANT,
0.0f));

const std::vector<size_t> sizes{ static_cast<std::size_t>(in_size_b), static_cast<std::size_t>(in_size_f),
static_cast<std::size_t>(in_size_y), static_cast<std::size_t>(in_size_x)};
std::vector<float> input_data = generate_rnd_real_input<float>(sizes, -8.0f, 8.0f);
set_values(input, input_data);

ExecutionConfig config;
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
network network(engine, topology, config);
network.set_input_data("input", input);

auto inst = network.get_primitive("border");
auto impl = inst->get_impl();
ASSERT_TRUE(impl != nullptr);
ASSERT_TRUE(impl->is_dynamic());

auto outputs = network.execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "border");

auto output = outputs.at("border").get_memory();
cldnn::mem_lock<float> output_ptr(output, get_test_stream());

for (auto b = 0; b < out_size_b; ++b) { // B
for (auto f = 0; f < out_size_f; ++f) { // F
for (auto y = 0; y < out_size_y; ++y) { // Y
for (auto x = 0; x < out_size_x; ++x) { // X
auto output_off = ((b * out_size_f + f) * out_size_y + y) * out_size_x + x; // BFYX

if (b < blt_size_b || b >= out_size_b - brb_size_b ||
f < blt_size_f || f >= out_size_f - brb_size_f ||
y < blt_size_y || y >= out_size_y - brb_size_y ||
x < blt_size_x || x >= out_size_x - brb_size_x) {
ASSERT_EQ(output_ptr[output_off], 0.0f);
} else {
auto input_off = (((b - blt_size_b) * in_size_f + f - blt_size_f) * in_size_y + y - blt_size_y) * in_size_x + x - blt_size_x; // BFYX
ASSERT_EQ(output_ptr[output_off], input_data[input_off]);
}
}
}
}
}
}
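
As a quick sanity check of the constants in this test: the padded output shape works out to (2+2+1, 3+1+2, 5+2+3, 4+3+4) = (5, 6, 10, 11), so the loops walk 5 * 6 * 10 * 11 = 3300 output elements, of which only 2 * 3 * 5 * 4 = 120 map back to the input; every other element is expected to equal the constant pad value 0.0f.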