Skip to content

Commit

Permalink
[GPU] Fixed friendly name of out transpose, improve Pad performance (o…
Browse files Browse the repository at this point in the history
…penvinotoolkit#8546)

* Fixed friendly names in post-processing nodes

* [GPU] Added fsv16 support for pad operation
  • Loading branch information
vladimir-paramuzov authored Nov 15, 2021
1 parent 5352c2b commit 3b34f09
Show file tree
Hide file tree
Showing 8 changed files with 153 additions and 24 deletions.
11 changes: 9 additions & 2 deletions inference-engine/src/cldnn_engine/cldnn_program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "ngraph/ops.hpp"
#include "ngraph_ops/nms_ie_internal.hpp"
#include "cldnn_itt.h"
#include "cldnn/runtime/debug_configuration.hpp"

using namespace InferenceEngine;
using namespace InferenceEngine::details;
Expand Down Expand Up @@ -231,6 +232,12 @@ void Program::CreateSingleLayerPrimitive(cldnn::topology& topology, const std::s
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "Program::CreateSingleLayerPrimitive");
InitProfileInfo(op->get_friendly_name(), op->get_type_name());

GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "Process " << "op::v" << op->get_type_info().version << "::" << op->get_type_name() << " operation "
<< "(friendly_name=" << op->get_friendly_name() << ")" << std::endl;
}

bool is_created = false;
const ngraph::NodeTypeInfo* op_type_info = &op->get_type_info();
while (op_type_info != nullptr) {
Expand All @@ -251,8 +258,8 @@ void Program::CreateSingleLayerPrimitive(cldnn::topology& topology, const std::s

if (!is_created) {
IE_THROW() << "Operation: " << op->get_friendly_name()
<< " of type " << op->get_type_name()
<< "(op::v" << op->get_type_info().version << ") is not supported";
<< " of type " << op->get_type_name()
<< "(op::v" << op->get_type_info().version << ") is not supported";
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,16 @@ ParamsKey BorderKernelRef::GetSupportedKey() const {
k.EnableInputLayout(DataLayout::byxf);
k.EnableInputLayout(DataLayout::bfzyx);
k.EnableInputLayout(DataLayout::bfwzyx);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);

k.EnableOutputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::yxfb);
k.EnableOutputLayout(DataLayout::byxf);
k.EnableOutputLayout(DataLayout::bfzyx);
k.EnableOutputLayout(DataLayout::bfwzyx);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);

k.EnableTensorOffset();
k.EnableTensorPitches();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,35 @@
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"

// Translates logical 6D coordinates (b, f, w, z, y, x) into the linear
// offset of the input buffer for the actual input rank. Outer dimensions
// that the input does not have are simply ignored, so callers can always
// pass the full 6D coordinate set.
inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x)
{
#if INPUT0_DIMS == 6
    return INPUT0_GET_INDEX(b, f, w, z, y, x);
#elif INPUT0_DIMS == 5
    return INPUT0_GET_INDEX(b, f, z, y, x);
#elif INPUT0_DIMS < 5
    return INPUT0_GET_INDEX(b, f, y, x);
#else
#error [clDNN border_gpu_ref.cl]: input format - not supported
#endif
}

// Translates logical 6D coordinates (b, f, w, z, y, x) into the linear
// offset of the output buffer for the actual output rank. Mirrors
// get_input_index but uses the OUTPUT tensor's layout macros.
inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint x)
{
#if OUTPUT_DIMS == 6
    return OUTPUT_GET_INDEX(b, f, w, z, y, x);
#elif OUTPUT_DIMS == 5
    return OUTPUT_GET_INDEX(b, f, z, y, x);
#elif OUTPUT_DIMS < 5
    return OUTPUT_GET_INDEX(b, f, y, x);
#else
#error [clDNN border_gpu_ref.cl]: output format - not supported
#endif
}

KERNEL(border_gpu_ref)(
const __global UNIT_TYPE* input,
__global UNIT_TYPE* output)
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output)
{
// [CONSTEXPR]
// Border sizes (left-top set and right-bottom set):
Expand Down Expand Up @@ -72,7 +97,7 @@ KERNEL(border_gpu_ref)(
const uint out_w = out_yw / OUTPUT_SIZE_Y;

#ifdef BORDER_TYPE_CONSTANT
UNIT_TYPE in_val = TO_UNIT_TYPE(BORDER_VALUE);
INPUT0_TYPE in_val = TO_INPUT0_TYPE(BORDER_VALUE);

if (out_x >= blt_sx & out_x < in_lx &
out_y >= blt_sy & out_y < in_ly &
Expand All @@ -88,7 +113,7 @@ KERNEL(border_gpu_ref)(
const uint in_f = out_f - blt_sf;
const uint in_b = out_b - blt_sb;

const uint in_pos = GET_DATA_INDEX_6D(INPUT0, in_b, in_f, in_w, in_z, in_y, in_x);
const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
in_val = input[in_pos];
}
#elif defined BORDER_TYPE_EDGE
Expand All @@ -99,8 +124,8 @@ KERNEL(border_gpu_ref)(
const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? 0 : in_sf - 1);
const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? 0 : in_sb - 1);

const uint in_pos = GET_DATA_INDEX_6D(INPUT0, in_b, in_f, in_w, in_z, in_y, in_x);
UNIT_TYPE in_val = input[in_pos];
const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
INPUT0_TYPE in_val = input[in_pos];
#elif defined BORDER_TYPE_MIRROR
const uint in_x = (out_x >= blt_sx & out_x < in_lx) ? out_x - blt_sx : (out_x < blt_sx ? blt_sx - 1 - out_x : in_sx + in_lx - 1 - out_x);
const uint in_y = (out_y >= blt_sy & out_y < in_ly) ? out_y - blt_sy : (out_y < blt_sy ? blt_sy - 1 - out_y : in_sy + in_ly - 1 - out_y);
Expand All @@ -109,8 +134,8 @@ KERNEL(border_gpu_ref)(
const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? blt_sf - 1 - out_f : in_sf + in_lf - 1 - out_f);
const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? blt_sb - 1 - out_b : in_sb + in_lb - 1 - out_b);

const uint in_pos = GET_DATA_INDEX_6D(INPUT0, in_b, in_f, in_w, in_z, in_y, in_x);
UNIT_TYPE in_val = input[in_pos];
const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
INPUT0_TYPE in_val = input[in_pos];
#elif defined BORDER_TYPE_MIRROR_101
const uint in_x = (out_x >= blt_sx & out_x < in_lx) ? out_x - blt_sx : (out_x < blt_sx ? blt_sx - out_x : in_sx + in_lx - 2 - out_x);
const uint in_y = (out_y >= blt_sy & out_y < in_ly) ? out_y - blt_sy : (out_y < blt_sy ? blt_sy - out_y : in_sy + in_ly - 2 - out_y);
Expand All @@ -119,12 +144,12 @@ KERNEL(border_gpu_ref)(
const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? blt_sf - out_f : in_sf + in_lf - 2 - out_f);
const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? blt_sb - out_b : in_sb + in_lb - 2 - out_b);

const uint in_pos = GET_DATA_INDEX_6D(INPUT0, in_b, in_f, in_w, in_z, in_y, in_x);
UNIT_TYPE in_val = input[in_pos];
const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
INPUT0_TYPE in_val = input[in_pos];
#else
#error Unsupported border type.
#endif

const uint out_pos = GET_DATA_INDEX_6D(OUTPUT, out_b, out_f, out_w, out_z, out_y, out_x);
const uint out_pos = FUNC_CALL(get_output_index)(out_b, out_f, out_w, out_z, out_y, out_x);
output[out_pos] = in_val;
}
11 changes: 0 additions & 11 deletions inference-engine/thirdparty/clDNN/src/border.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,23 +87,12 @@ std::string border_inst::to_string(border_node const& node) {
border_inst::typed_primitive_inst(network& network, border_node const& node) : parent(network, node) {
auto input_layout = node.input().get_output_layout();

const auto input_format = input_layout.format;
const auto& input_sizes = input_layout.size;

auto lt_sizes = argument.left_top_sizes.sub(tensor(0));
auto rb_sizes = argument.right_bottom_sizes.sub(tensor(0));
auto b_type = argument.type;

CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(),
"Input format",
input_format.value,
"supported border primitive input formats",
format::bfyx,
format::yxfb,
format::byxf,
format::bfzyx,
format::bfwzyx);

tensor null_tensor = tensor(0);

// Check if sizes of border are in proper range.
Expand Down
14 changes: 14 additions & 0 deletions inference-engine/thirdparty/clDNN/src/impls/ocl/border.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,22 +72,36 @@ attach_border_impl::attach_border_impl() {
std::make_tuple(data_types::f16, format::yxfb),
std::make_tuple(data_types::i8, format::yxfb),
std::make_tuple(data_types::u8, format::yxfb),

std::make_tuple(data_types::f32, format::bfyx),
std::make_tuple(data_types::f16, format::bfyx),
std::make_tuple(data_types::i8, format::bfyx),
std::make_tuple(data_types::u8, format::bfyx),

std::make_tuple(data_types::f32, format::byxf),
std::make_tuple(data_types::f16, format::byxf),
std::make_tuple(data_types::i8, format::byxf),
std::make_tuple(data_types::u8, format::byxf),

std::make_tuple(data_types::f32, format::bfzyx),
std::make_tuple(data_types::f16, format::bfzyx),
std::make_tuple(data_types::i8, format::bfzyx),
std::make_tuple(data_types::u8, format::bfzyx),

std::make_tuple(data_types::f32, format::bfwzyx),
std::make_tuple(data_types::f16, format::bfwzyx),
std::make_tuple(data_types::i8, format::bfwzyx),
std::make_tuple(data_types::u8, format::bfwzyx),

std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),

std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
});
}

Expand Down
2 changes: 2 additions & 0 deletions inference-engine/thirdparty/clDNN/src/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
#include "lstm_gemm_inst.h"
#include "mutable_data_inst.h"
#include "pooling_inst.h"
#include "border_inst.h"
#include "primitive_inst.h"
#include "prior_box_inst.h"
#include "proposal_inst.h"
Expand Down Expand Up @@ -1295,6 +1296,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
prim.type() != cldnn::input_layout::type_id() &&
prim.type() != cldnn::softmax::type_id() &&
prim.type() != cldnn::prior_box::type_id() &&
prim.type() != cldnn::border::type_id() &&
prim.type() != cldnn::resample::type_id() &&
prim.type() != cldnn::crop::type_id() &&
prim.type() != cldnn::scale::type_id() &&
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,84 @@ TEST(border_gpu, basic_yxfb_0x0x1x2_0x0x3x4_border_constant) {
}
}

TEST(border_gpu, basic_fsv16_0x0x1x2_0x0x3x4_border_constant) {
    // Pads a 4x3 (XY) input with constant zeros to a 10x7 (XY) output.
    // The border primitive is executed on a blocked b_fs_yx_fsv16 layout and
    // the result is reordered back to yxfb for element-wise verification.

    // Source tensor extents.
    constexpr auto src_b = 1;
    constexpr auto src_f = 1;
    constexpr auto src_y = 3;
    constexpr auto src_x = 4;

    // Left/top padding per dimension.
    constexpr auto pad_lt_b = 0;
    constexpr auto pad_lt_f = 0;
    constexpr auto pad_lt_y = 1;
    constexpr auto pad_lt_x = 2;

    // Right/bottom padding per dimension.
    constexpr auto pad_rb_b = 0;
    constexpr auto pad_rb_f = 0;
    constexpr auto pad_rb_y = 3;
    constexpr auto pad_rb_x = 4;

    // Destination extents are source plus both padding sets.
    constexpr auto dst_b = src_b + pad_lt_b + pad_rb_b;
    constexpr auto dst_f = src_f + pad_lt_f + pad_rb_f;
    constexpr auto dst_y = src_y + pad_lt_y + pad_rb_y;
    constexpr auto dst_x = src_x + pad_lt_x + pad_rb_x;

    auto& engine = get_test_engine();
    auto input = engine.allocate_memory({data_types::f32, format::yxfb, {src_b, src_f, src_x, src_y}});

    topology topology;
    topology.add(input_layout("input", input->get_layout()));
    topology.add(
        reorder("border_input", "input", cldnn::format::b_fs_yx_fsv16, cldnn::data_types::f32),
        border("border", "border_input",
               {pad_lt_b, pad_lt_f, pad_lt_x, pad_lt_y},
               {pad_rb_b, pad_rb_f, pad_rb_x, pad_rb_y},
               border_type::constant, 0.0f),
        reorder("output", "border", cldnn::format::yxfb, cldnn::data_types::f32));

    std::vector<float> input_data = {
        1, -2, 3, -4,
        5, 6, 7, 8,
        -10, 12, 13, -13,
    };
    // Expected output: the source block shifted by the left/top pad,
    // surrounded by the constant border value (0).
    std::vector<float> expected = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, -2, 3, -4, 0, 0, 0, 0,
        0, 0, 5, 6, 7, 8, 0, 0, 0, 0,
        0, 0, -10, 12, 13, -13, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    };
    set_values(input, input_data);

    cldnn::network network(engine, topology);
    network.set_input_data("input", input);
    auto outputs = network.execute();

    auto output = outputs.at("output").get_memory();
    cldnn::mem_lock<float> output_ptr(output, get_test_stream());

    ASSERT_EQ(expected.size(), static_cast<std::size_t>(dst_b * dst_f * dst_y * dst_x));

    // Compare every element using the YXFB linearization of (b, f, y, x).
    for (auto b = 0; b < dst_b; ++b) {
        for (auto f = 0; f < dst_f; ++f) {
            for (auto y = 0; y < dst_y; ++y) {
                for (auto x = 0; x < dst_x; ++x) {
                    const auto idx = ((y * dst_x + x) * dst_f + f) * dst_b + b;
                    EXPECT_EQ(output_ptr[idx], expected[idx]);
                }
            }
        }
    }
}

TEST(border_gpu, basic_bfzyx_0x0x1x01_0x0x0x0x3_border_constant) {

constexpr auto in_size_b = 1;
Expand Down
10 changes: 10 additions & 0 deletions ngraph/core/src/preprocess/pre_post_process.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -542,10 +542,12 @@ std::shared_ptr<Function> PrePostProcessor::build(const std::shared_ptr<Function
}
// Apply post-processing
node = result->get_input_source_output(0);
bool post_processing_applied = false;
if (output->m_postprocess) {
for (const auto& action : output->m_postprocess->actions()) {
auto action_result = action({node}, context);
node = std::get<0>(action_result);
post_processing_applied = true;
}
}
// Implicit: Convert element type + layout to user's tensor implicitly
Expand All @@ -561,10 +563,18 @@ std::shared_ptr<Function> PrePostProcessor::build(const std::shared_ptr<Function
for (const auto& action : implicit_steps.actions()) {
auto action_result = action({node}, context);
node = std::get<0>(action_result);
post_processing_applied = true;
}
node.get_node_shared_ptr()->set_friendly_name(
result->get_input_source_output(0).get_node_shared_ptr()->get_friendly_name());

// Reset friendly name of input node to avoid names collision
// when there is a new node inserted by post-processing steps
// If no new nodes are inserted by post-processing, then we need to preserve friendly name of input
// as it's required for old API correct work
if (post_processing_applied)
result->get_input_source_output(0).get_node_shared_ptr()->set_friendly_name("");

// Create result
auto new_result = std::make_shared<ov::op::v0::Result>(node);
new_result->set_friendly_name(result->get_friendly_name());
Expand Down

0 comments on commit 3b34f09

Please sign in to comment.