Skip to content

Commit

Permalink
[GPU] Fixed friendly name of out transpose, improve Pad performance (o…
Browse files Browse the repository at this point in the history
…penvinotoolkit#8546)

* Fixed friendly names in post-processing nodes

* [GPU] Added fsv16 support for pad operation
  • Loading branch information
vladimir-paramuzov authored Nov 15, 2021
1 parent 5352c2b commit 3b34f09
Show file tree
Hide file tree
Showing 8 changed files with 153 additions and 24 deletions.
11 changes: 9 additions & 2 deletions inference-engine/src/cldnn_engine/cldnn_program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "ngraph/ops.hpp"
#include "ngraph_ops/nms_ie_internal.hpp"
#include "cldnn_itt.h"
#include "cldnn/runtime/debug_configuration.hpp"

using namespace InferenceEngine;
using namespace InferenceEngine::details;
Expand Down Expand Up @@ -231,6 +232,12 @@ void Program::CreateSingleLayerPrimitive(cldnn::topology& topology, const std::s
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "Program::CreateSingleLayerPrimitive");
InitProfileInfo(op->get_friendly_name(), op->get_type_name());

GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "Process " << "op::v" << op->get_type_info().version << "::" << op->get_type_name() << " operation "
<< "(friendly_name=" << op->get_friendly_name() << ")" << std::endl;
}

bool is_created = false;
const ngraph::NodeTypeInfo* op_type_info = &op->get_type_info();
while (op_type_info != nullptr) {
Expand All @@ -251,8 +258,8 @@ void Program::CreateSingleLayerPrimitive(cldnn::topology& topology, const std::s

if (!is_created) {
IE_THROW() << "Operation: " << op->get_friendly_name()
<< " of type " << op->get_type_name()
<< "(op::v" << op->get_type_info().version << ") is not supported";
<< " of type " << op->get_type_name()
<< "(op::v" << op->get_type_info().version << ") is not supported";
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,16 @@ ParamsKey BorderKernelRef::GetSupportedKey() const {
k.EnableInputLayout(DataLayout::byxf);
k.EnableInputLayout(DataLayout::bfzyx);
k.EnableInputLayout(DataLayout::bfwzyx);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);

k.EnableOutputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::yxfb);
k.EnableOutputLayout(DataLayout::byxf);
k.EnableOutputLayout(DataLayout::bfzyx);
k.EnableOutputLayout(DataLayout::bfwzyx);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);

k.EnableTensorOffset();
k.EnableTensorPitches();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,35 @@
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"

// Translates logical 6D coordinates (b, f, w, z, y, x) into the linear
// offset of the input buffer for the actual input rank. Outer dimensions
// that the input does not have are simply ignored, so callers can always
// pass the full 6D coordinate set.
inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x)
{
#if INPUT0_DIMS == 6
    return INPUT0_GET_INDEX(b, f, w, z, y, x);
#elif INPUT0_DIMS == 5
    return INPUT0_GET_INDEX(b, f, z, y, x);
#elif INPUT0_DIMS < 5
    return INPUT0_GET_INDEX(b, f, y, x);
#else
#error [clDNN border_gpu_ref.cl]: input format - not supported
#endif
}

// Translates logical 6D coordinates (b, f, w, z, y, x) into the linear
// offset of the output buffer for the actual output rank. Mirrors
// get_input_index but uses the OUTPUT tensor's layout macros.
inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint x)
{
#if OUTPUT_DIMS == 6
    return OUTPUT_GET_INDEX(b, f, w, z, y, x);
#elif OUTPUT_DIMS == 5
    return OUTPUT_GET_INDEX(b, f, z, y, x);
#elif OUTPUT_DIMS < 5
    return OUTPUT_GET_INDEX(b, f, y, x);
#else
#error [clDNN border_gpu_ref.cl]: output format - not supported
#endif
}

KERNEL(border_gpu_ref)(
const __global UNIT_TYPE* input,
__global UNIT_TYPE* output)
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output)
{
// [CONSTEXPR]
// Border sizes (left-top set and right-bottom set):
Expand Down Expand Up @@ -72,7 +97,7 @@ KERNEL(border_gpu_ref)(
const uint out_w = out_yw / OUTPUT_SIZE_Y;

#ifdef BORDER_TYPE_CONSTANT
UNIT_TYPE in_val = TO_UNIT_TYPE(BORDER_VALUE);
INPUT0_TYPE in_val = TO_INPUT0_TYPE(BORDER_VALUE);

if (out_x >= blt_sx & out_x < in_lx &
out_y >= blt_sy & out_y < in_ly &
Expand All @@ -88,7 +113,7 @@ KERNEL(border_gpu_ref)(
const uint in_f = out_f - blt_sf;
const uint in_b = out_b - blt_sb;

const uint in_pos = GET_DATA_INDEX_6D(INPUT0, in_b, in_f, in_w, in_z, in_y, in_x);
const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
in_val = input[in_pos];
}
#elif defined BORDER_TYPE_EDGE
Expand All @@ -99,8 +124,8 @@ KERNEL(border_gpu_ref)(
const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? 0 : in_sf - 1);
const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? 0 : in_sb - 1);

const uint in_pos = GET_DATA_INDEX_6D(INPUT0, in_b, in_f, in_w, in_z, in_y, in_x);
UNIT_TYPE in_val = input[in_pos];
const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
INPUT0_TYPE in_val = input[in_pos];
#elif defined BORDER_TYPE_MIRROR
const uint in_x = (out_x >= blt_sx & out_x < in_lx) ? out_x - blt_sx : (out_x < blt_sx ? blt_sx - 1 - out_x : in_sx + in_lx - 1 - out_x);
const uint in_y = (out_y >= blt_sy & out_y < in_ly) ? out_y - blt_sy : (out_y < blt_sy ? blt_sy - 1 - out_y : in_sy + in_ly - 1 - out_y);
Expand All @@ -109,8 +134,8 @@ KERNEL(border_gpu_ref)(
const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? blt_sf - 1 - out_f : in_sf + in_lf - 1 - out_f);
const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? blt_sb - 1 - out_b : in_sb + in_lb - 1 - out_b);

const uint in_pos = GET_DATA_INDEX_6D(INPUT0, in_b, in_f, in_w, in_z, in_y, in_x);
UNIT_TYPE in_val = input[in_pos];
const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
INPUT0_TYPE in_val = input[in_pos];
#elif defined BORDER_TYPE_MIRROR_101
const uint in_x = (out_x >= blt_sx & out_x < in_lx) ? out_x - blt_sx : (out_x < blt_sx ? blt_sx - out_x : in_sx + in_lx - 2 - out_x);
const uint in_y = (out_y >= blt_sy & out_y < in_ly) ? out_y - blt_sy : (out_y < blt_sy ? blt_sy - out_y : in_sy + in_ly - 2 - out_y);
Expand All @@ -119,12 +144,12 @@ KERNEL(border_gpu_ref)(
const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? blt_sf - out_f : in_sf + in_lf - 2 - out_f);
const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? blt_sb - out_b : in_sb + in_lb - 2 - out_b);

const uint in_pos = GET_DATA_INDEX_6D(INPUT0, in_b, in_f, in_w, in_z, in_y, in_x);
UNIT_TYPE in_val = input[in_pos];
const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
INPUT0_TYPE in_val = input[in_pos];
#else
#error Unsupported border type.
#endif

const uint out_pos = GET_DATA_INDEX_6D(OUTPUT, out_b, out_f, out_w, out_z, out_y, out_x);
const uint out_pos = FUNC_CALL(get_output_index)(out_b, out_f, out_w, out_z, out_y, out_x);
output[out_pos] = in_val;
}
11 changes: 0 additions & 11 deletions inference-engine/thirdparty/clDNN/src/border.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,23 +87,12 @@ std::string border_inst::to_string(border_node const& node) {
border_inst::typed_primitive_inst(network& network, border_node const& node) : parent(network, node) {
auto input_layout = node.input().get_output_layout();

const auto input_format = input_layout.format;
const auto& input_sizes = input_layout.size;

auto lt_sizes = argument.left_top_sizes.sub(tensor(0));
auto rb_sizes = argument.right_bottom_sizes.sub(tensor(0));
auto b_type = argument.type;

CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(),
"Input format",
input_format.value,
"supported border primitive input formats",
format::bfyx,
format::yxfb,
format::byxf,
format::bfzyx,
format::bfwzyx);

tensor null_tensor = tensor(0);

// Check if sizes of border are in proper range.
Expand Down
14 changes: 14 additions & 0 deletions inference-engine/thirdparty/clDNN/src/impls/ocl/border.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,22 +72,36 @@ attach_border_impl::attach_border_impl() {
std::make_tuple(data_types::f16, format::yxfb),
std::make_tuple(data_types::i8, format::yxfb),
std::make_tuple(data_types::u8, format::yxfb),

std::make_tuple(data_types::f32, format::bfyx),
std::make_tuple(data_types::f16, format::bfyx),
std::make_tuple(data_types::i8, format::bfyx),
std::make_tuple(data_types::u8, format::bfyx),

std::make_tuple(data_types::f32, format::byxf),
std::make_tuple(data_types::f16, format::byxf),
std::make_tuple(data_types::i8, format::byxf),
std::make_tuple(data_types::u8, format::byxf),

std::make_tuple(data_types::f32, format::bfzyx),
std::make_tuple(data_types::f16, format::bfzyx),
std::make_tuple(data_types::i8, format::bfzyx),
std::make_tuple(data_types::u8, format::bfzyx),

std::make_tuple(data_types::f32, format::bfwzyx),
std::make_tuple(data_types::f16, format::bfwzyx),
std::make_tuple(data_types::i8, format::bfwzyx),
std::make_tuple(data_types::u8, format::bfwzyx),

std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),

std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
});
}

Expand Down
2 changes: 2 additions & 0 deletions inference-engine/thirdparty/clDNN/src/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
#include "lstm_gemm_inst.h"
#include "mutable_data_inst.h"
#include "pooling_inst.h"
#include "border_inst.h"
#include "primitive_inst.h"
#include "prior_box_inst.h"
#include "proposal_inst.h"
Expand Down Expand Up @@ -1295,6 +1296,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
prim.type() != cldnn::input_layout::type_id() &&
prim.type() != cldnn::softmax::type_id() &&
prim.type() != cldnn::prior_box::type_id() &&
prim.type() != cldnn::border::type_id() &&
prim.type() != cldnn::resample::type_id() &&
prim.type() != cldnn::crop::type_id() &&
prim.type() != cldnn::scale::type_id() &&
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,84 @@ TEST(border_gpu, basic_yxfb_0x0x1x2_0x0x3x4_border_constant) {
}
}

TEST(border_gpu, basic_fsv16_0x0x1x2_0x0x3x4_border_constant) {
    // Pads a 4x3 (XY) input with constant zeros to a 10x7 (XY) output.
    // The border primitive is executed on a blocked b_fs_yx_fsv16 layout and
    // the result is reordered back to yxfb for element-wise verification.

    // Source tensor extents.
    constexpr auto src_b = 1;
    constexpr auto src_f = 1;
    constexpr auto src_y = 3;
    constexpr auto src_x = 4;

    // Left/top padding per dimension.
    constexpr auto pad_lt_b = 0;
    constexpr auto pad_lt_f = 0;
    constexpr auto pad_lt_y = 1;
    constexpr auto pad_lt_x = 2;

    // Right/bottom padding per dimension.
    constexpr auto pad_rb_b = 0;
    constexpr auto pad_rb_f = 0;
    constexpr auto pad_rb_y = 3;
    constexpr auto pad_rb_x = 4;

    // Destination extents are source plus both padding sets.
    constexpr auto dst_b = src_b + pad_lt_b + pad_rb_b;
    constexpr auto dst_f = src_f + pad_lt_f + pad_rb_f;
    constexpr auto dst_y = src_y + pad_lt_y + pad_rb_y;
    constexpr auto dst_x = src_x + pad_lt_x + pad_rb_x;

    auto& engine = get_test_engine();
    auto input = engine.allocate_memory({data_types::f32, format::yxfb, {src_b, src_f, src_x, src_y}});

    topology topology;
    topology.add(input_layout("input", input->get_layout()));
    topology.add(
        reorder("border_input", "input", cldnn::format::b_fs_yx_fsv16, cldnn::data_types::f32),
        border("border", "border_input",
               {pad_lt_b, pad_lt_f, pad_lt_x, pad_lt_y},
               {pad_rb_b, pad_rb_f, pad_rb_x, pad_rb_y},
               border_type::constant, 0.0f),
        reorder("output", "border", cldnn::format::yxfb, cldnn::data_types::f32));

    std::vector<float> input_data = {
        1, -2, 3, -4,
        5, 6, 7, 8,
        -10, 12, 13, -13,
    };
    // Expected output: the source block shifted by the left/top pad,
    // surrounded by the constant border value (0).
    std::vector<float> expected = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, -2, 3, -4, 0, 0, 0, 0,
        0, 0, 5, 6, 7, 8, 0, 0, 0, 0,
        0, 0, -10, 12, 13, -13, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    };
    set_values(input, input_data);

    cldnn::network network(engine, topology);
    network.set_input_data("input", input);
    auto outputs = network.execute();

    auto output = outputs.at("output").get_memory();
    cldnn::mem_lock<float> output_ptr(output, get_test_stream());

    ASSERT_EQ(expected.size(), static_cast<std::size_t>(dst_b * dst_f * dst_y * dst_x));

    // Compare every element using the YXFB linearization of (b, f, y, x).
    for (auto b = 0; b < dst_b; ++b) {
        for (auto f = 0; f < dst_f; ++f) {
            for (auto y = 0; y < dst_y; ++y) {
                for (auto x = 0; x < dst_x; ++x) {
                    const auto idx = ((y * dst_x + x) * dst_f + f) * dst_b + b;
                    EXPECT_EQ(output_ptr[idx], expected[idx]);
                }
            }
        }
    }
}

TEST(border_gpu, basic_bfzyx_0x0x1x01_0x0x0x0x3_border_constant) {

constexpr auto in_size_b = 1;
Expand Down
10 changes: 10 additions & 0 deletions ngraph/core/src/preprocess/pre_post_process.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -542,10 +542,12 @@ std::shared_ptr<Function> PrePostProcessor::build(const std::shared_ptr<Function
}
// Apply post-processing
node = result->get_input_source_output(0);
bool post_processing_applied = false;
if (output->m_postprocess) {
for (const auto& action : output->m_postprocess->actions()) {
auto action_result = action({node}, context);
node = std::get<0>(action_result);
post_processing_applied = true;
}
}
// Implicit: Convert element type + layout to user's tensor implicitly
Expand All @@ -561,10 +563,18 @@ std::shared_ptr<Function> PrePostProcessor::build(const std::shared_ptr<Function
for (const auto& action : implicit_steps.actions()) {
auto action_result = action({node}, context);
node = std::get<0>(action_result);
post_processing_applied = true;
}
node.get_node_shared_ptr()->set_friendly_name(
result->get_input_source_output(0).get_node_shared_ptr()->get_friendly_name());

// Reset friendly name of input node to avoid names collision
// when there is a new node inserted by post-processing steps
// If no new nodes are inserted by post-processing, then we need to preserve friendly name of input
// as it's required for old API correct work
if (post_processing_applied)
result->get_input_source_output(0).get_node_shared_ptr()->set_friendly_name("");

// Create result
auto new_result = std::make_shared<ov::op::v0::Result>(node);
new_result->set_friendly_name(result->get_friendly_name());
Expand Down

0 comments on commit 3b34f09

Please sign in to comment.