[GPU] Fuse more 'type conversion only' reorders (openvinotoolkit#25270)
### Details:
 - A reorder that only converts the element data type can now be fused into the node that produces its input

### Tickets:
 - 144957
yeonbok authored Jul 3, 2024
1 parent 626966b commit 1a26e6f
Showing 15 changed files with 252 additions and 59 deletions.
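For context: in the GPU plugin graph, a `reorder` primitive can change format, layout, padding, or element data type. This commit targets reorders that change only the element type and folds that conversion into the producer node, starting with the `remove_redundant_reorders` pass in the first diff below. A minimal sketch of the predicate involved — a hypothetical standalone form, not the plugin's exact `reorder_node::is_type_conversion_only()` API:

```cpp
#include "intel_gpu/runtime/layout.hpp"

// Sketch: a reorder is "type conversion only" when the input and output
// layouts match in every respect except the element data type.
bool is_type_conversion_only(const cldnn::layout& in, const cldnn::layout& out) {
    return in.format == out.format &&
           in.get_partial_shape() == out.get_partial_shape() &&
           in.data_type != out.data_type;
}
```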
@@ -11,6 +11,10 @@
#include "convert_color_inst.h"
#include "one_hot_inst.h"
#include "shape_of_inst.h"
#include "gather_inst.h"
#include "select_inst.h"
#include "eltwise_inst.h"
#include "broadcast_inst.h"
#include "permute_inst.h"
#include "depth_to_space_inst.h"
#include "concatenation_inst.h"
@@ -410,8 +414,11 @@ void remove_redundant_reorders::run(program& p) {
continue;

bool same_data_type = input.get_output_layout().data_type == output_layout.data_type;
bool allowed_dt_conversion_fuse = (input.is_type<one_hot>() || input.is_type<permute>() || input.is_type<mvn>() || input.is_type<concatenation>() ||
input.is_type<depth_to_space>() || input.is_type<region_yolo>() || input.is_type<detection_output>());
bool allowed_dt_conversion_fuse =
(input.is_type<one_hot>() || input.is_type<permute>() || input.is_type<mvn>() ||
input.is_type<concatenation>() || input.is_type<depth_to_space>() || input.is_type<region_yolo>() ||
input.is_type<detection_output>() || input.is_type<gather>() || input.is_type<broadcast>() ||
input.is_type<select>() || input.is_type<eltwise>());
if (!same_data_type && !allowed_dt_conversion_fuse)
continue;

@@ -426,8 +433,10 @@ void remove_redundant_reorders::run(program& p) {
auto old_output_layout_of_input = input.get_output_layout();
input.set_output_layout(output_layout, false);
if (input.type()->does_possible_implementation_exist(input)) {
// Add fused_primitive_desc of reorder to the previous node which propagates original output layout during shape inference
if (input.is_type<mvn>() || input.is_type<concatenation>()) {
// Add fused_primitive_desc of reorder to the previous node which propagates original output layout
// during shape inference
if (input.is_type<mvn>() || input.is_type<concatenation>() || input.is_type<gather>() ||
input.is_type<broadcast>() || input.is_type<select>() || input.is_type<eltwise>()) {
fused_primitive_desc local_desc(node.get_primitive());
local_desc.f_param = node.get_fuse_params();
local_desc.total_num_deps = node.get_dependencies().size();
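The `fused_primitive_desc` built here records the reorder on the producer so that shape inference keeps propagating the producer's original output layout while the node reports the reorder's output type. A plausible continuation of the block — the fields and call beyond those shown in the diff are assumptions based on how fused descriptors are typically populated in this pass:

```cpp
// Sketch (assumed fields/API beyond what the diff shows): remember both
// layouts, then attach the descriptor to the producer node.
fused_primitive_desc local_desc(node.get_primitive());
local_desc.f_param = node.get_fuse_params();
local_desc.total_num_deps = node.get_dependencies().size();
local_desc.input_layout = old_output_layout_of_input;   // assumed field
local_desc.output_layout = output_layout;               // assumed field
input.add_fused_primitive(local_desc);                  // assumed API
```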
14 changes: 12 additions & 2 deletions src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -13,6 +13,7 @@
#include "reshape_inst.h"
#include "arg_max_min_inst.h"
#include "shape_of_inst.h"
#include "select_inst.h"
#include "condition_inst.h"
#include "strided_slice_inst.h"
#include <sstream>
@@ -33,6 +34,7 @@
#include "prior_box_inst.h"
#include "scatter_nd_update_inst.h"
#include "gather_inst.h"
#include "broadcast_inst.h"
#include "loop_inst.h"
#include "dft_inst.h"
#include "to_string_utils.h"
@@ -428,10 +430,18 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, reorder_node
bool allow_new_shape_infer = node.get_program().is_new_shape_infer();
// Because the node types listed below have cross-layout-capable kernels, a reorder that only performs
// type conversion can be fused into the previous node even in a dynamic shape case
if ((prev.is_type<mvn>() || prev.is_type<concatenation>()) &&
if ((prev.is_type<mvn>() || prev.is_type<concatenation>() || prev.is_type<gather>() || prev.is_type<broadcast>() ||
prev.is_type<select>() || prev.is_type<eltwise>()) &&
!prev.is_in_shape_of_subgraph() && node.is_type_conversion_only() &&
(format::is_simple_data_format(fmt_prev) && format::is_simple_data_format(fmt_next)))
(format::is_simple_data_format(fmt_prev) && format::is_simple_data_format(fmt_next)) &&
// If the prev node is a backedge of the loop, fusing the reorder would change its type.
// We could avoid only that case if we could check whether the current node is a backedge of the network.
// However, no such handle exists yet. (To be done in the future, when we need to optimize out the
// type-converting reorders in the body network.)
!node.get_program().is_body_program() &&
prev.get_preferred_impl_type() != cldnn::impl_types::cpu) {
return true;
}

if (prev.is_dynamic() || (!node.get_users().empty() && node.get_users().front()->is_dynamic()))
return false;
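Condensing the new condition in `can_fuse_reorder_to_prev`, the fusion gate reads as follows — a restatement of the diff above, not new logic:

```cpp
// The producer list now includes gather/broadcast/select/eltwise; fusion is
// still refused inside shape_of subgraphs, for non-simple formats, inside a
// loop body program (backedge types must not change), and for CPU impls.
bool fuse_type_conversion_reorder =
    (prev.is_type<mvn>() || prev.is_type<concatenation>() || prev.is_type<gather>() ||
     prev.is_type<broadcast>() || prev.is_type<select>() || prev.is_type<eltwise>()) &&
    !prev.is_in_shape_of_subgraph() &&
    node.is_type_conversion_only() &&
    format::is_simple_data_format(fmt_prev) && format::is_simple_data_format(fmt_next) &&
    !node.get_program().is_body_program() &&
    prev.get_preferred_impl_type() != cldnn::impl_types::cpu;
```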
3 changes: 3 additions & 0 deletions src/plugins/intel_gpu/src/graph/select.cpp
@@ -39,6 +39,9 @@ std::vector<layout> select_inst::calc_output_layouts(const select_node& /*node*/

auto desc = impl_param.typed_desc<select>();
auto dt = desc->output_data_types[0].value_or(input1_layout.data_type);
if (impl_param.has_fused_primitives()) {
dt = impl_param.get_output_element_type();
}

ov::op::v1::Select op;
op.set_auto_broadcast(desc->broadcast_spec);
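The same override shows up wherever a fused reorder can change a node's output type: `calc_output_layouts` must prefer the fused chain's element type over the primitive's declared one. As a generic sketch of the recurring pattern, assuming `ov::element::Type` is the data-type handle flowing through:

```cpp
// Sketch: fused post-ops (here, a type-converting reorder) dictate the final
// output element type during shape inference.
template <typename ImplParams>
ov::element::Type resolve_output_dtype(const ImplParams& impl_param, ov::element::Type own_dt) {
    return impl_param.has_fused_primitives() ? impl_param.get_output_element_type()
                                             : own_dt;
}
```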
@@ -253,6 +253,7 @@ inline uint FUNC(get_idx_pos)(OPTIONAL_SHAPE_INFO_ARG uint out_b, uint out_f, ui
#define VLOAD CAT(vload, VEC_SIZE)
#define VSTORE CAT(vstore,VEC_SIZE)
#define INPUT0_VTYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE)
#define OUTPUT_VTYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE)

KERNEL(broadcast_gpu_ref)(
OPTIONAL_SHAPE_INFO_ARG
@@ -322,15 +323,18 @@ KERNEL(broadcast_gpu_ref)(
uint output_idx = out_pos;
unroll_for(uint j = 0; j < y_nums; j++) {
unroll_for(uint i = 0; i < x_stride; i++) {
output[output_idx + i] = input[idx_pos + i];
output[output_idx + i] = TO_OUTPUT_TYPE(input[idx_pos + i]);
}
output_idx += OUTPUT_SIZE_X;
}
} else {
uint output_idx = out_pos;
INPUT0_VTYPE input_vec = VLOAD(0, &input[idx_pos]);
unroll_for(uint i = 0; i < y_nums; i++) {
VSTORE(input_vec, 0, &output[output_idx]);
OUTPUT_VTYPE out_v;
for (int j = 0; j < VEC_SIZE; ++j)
out_v[j] = TO_OUTPUT_TYPE(input_vec[j]);
VSTORE(out_v, 0, &output[output_idx]);
output_idx += OUTPUT_SIZE_X;
}

@@ -339,7 +343,7 @@ KERNEL(broadcast_gpu_ref)(

output_idx = out_pos;
unroll_for(uint i = 0; i < y_nums; i++) {
output[output_idx + x_stride] = input_val;
output[output_idx + x_stride] = TO_OUTPUT_TYPE(input_val);
output_idx += OUTPUT_SIZE_X;
}
}
@@ -375,7 +379,7 @@ KERNEL(broadcast_gpu_ref)(
const uint out_pos = OUTPUT_GET_INDEX(out_b, out_f, out_y, out_x);
const uint idx_pos = FUNC_CALL(get_idx_pos)(OPTIONAL_SHAPE_INFO_TENSOR out_b, out_f, out_y, out_x);
#endif
output[out_pos] = input[idx_pos];
output[out_pos] = TO_OUTPUT_TYPE(input[idx_pos]);
}
}
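Since input and output element types can now differ, every store in the broadcast kernel goes through `TO_OUTPUT_TYPE`, including the vectorized path, where each lane is converted before `VSTORE`. A host-side C++ analogue of that lane-wise conversion — illustrative only, not kernel code:

```cpp
#include <array>
#include <cstddef>

// Analogue of the kernel change: convert each lane of a loaded vector to the
// output element type before storing (mirrors out_v[j] = TO_OUTPUT_TYPE(...)).
template <typename In, typename Out, std::size_t N>
std::array<Out, N> convert_lanes(const std::array<In, N>& v) {
    std::array<Out, N> out{};
    for (std::size_t j = 0; j < N; ++j)
        out[j] = static_cast<Out>(v[j]);
    return out;
}
```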

@@ -192,6 +192,6 @@ KERNEL(eltwise)(
#if QUANTIZATION_TERM && !OUTPUT_IS_FP
output[output_offset] = TO_OUTPUT_TYPE_SAT(ACTIVATION(out, ACTIVATION_PARAMS));
#else
output[output_offset] = ACTIVATION_TYPED(out, ACTIVATION_PARAMS_TYPED);
output[output_offset] = TO_OUTPUT_TYPE(ACTIVATION_TYPED(out, ACTIVATION_PARAMS_TYPED));
#endif
}
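The eltwise kernel makes the same adjustment: the quantized path already saturate-casts, and the general path now applies a plain convert so a fused dtype change is honored. In C++ terms the two casts differ roughly like this — an analogue of the macros, not the kernel code itself:

```cpp
#include <algorithm>
#include <cstdint>

// ~ TO_OUTPUT_TYPE_SAT: clamp into the representable range, then convert.
inline int8_t to_i8_saturated(float v) {
    return static_cast<int8_t>(std::clamp(v, -128.0f, 127.0f));
}

// ~ TO_OUTPUT_TYPE: plain conversion; correctness relies on the value range.
template <typename Out>
inline Out to_output_type(float v) { return static_cast<Out>(v); }
```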
@@ -45,7 +45,7 @@ KERNEL(select)(
uint output_offset = OUTPUT_GET_INDEX(b, f, y, x);
#endif

const OUTPUT_TYPE res = select(INPUT_2, INPUT_1, MASK);
const OUTPUT_TYPE res = TO_OUTPUT_TYPE(select(INPUT_2, INPUT_1, MASK));

output[output_offset] = res;
}
@@ -25,6 +25,7 @@ ParamsKey BroadcastKernelRef::GetSupportedKey() const {
k.EnableAllInputLayout();
k.EnableAllOutputLayout();

k.EnableDifferentTypes();
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
@@ -14,5 +14,8 @@ class BroadcastKernelRef : public BroadcastKernelBase {
KernelsData GetKernelsData(const Params& params) const override;
KernelsPriority GetKernelsPriority(const Params& params) const override;
ParamsKey GetSupportedKey() const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::REORDER };
}
};
} // namespace kernel_selector
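Each kernel opts into the new fusion by listing `FusedOpType::REORDER` in `GetSupportedFusedOps()`; the graph pass only fuses the reorder when the selected kernel advertises support, as the header diffs below repeat for eltwise, gather, and select. The pattern, using a hypothetical kernel name:

```cpp
// Sketch: advertising REORDER is what lets the remove_redundant_reorders pass
// attach a type-converting reorder to this kernel as a fused op.
std::vector<FusedOpType> SomeKernelRef::GetSupportedFusedOps() const {
    return { FusedOpType::QUANTIZE,
             FusedOpType::ACTIVATION,
             FusedOpType::ELTWISE,
             FusedOpType::REORDER };  // newly added
}
```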
@@ -19,7 +19,8 @@ class EltwiseKernelRef : public EltwiseKernelBase {
return {
FusedOpType::QUANTIZE,
FusedOpType::ACTIVATION,
FusedOpType::ELTWISE
FusedOpType::ELTWISE,
FusedOpType::REORDER
};
}

@@ -37,7 +37,8 @@ class GatherKernelRef : public KernelBaseOpenCL {
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::QUANTIZE,
FusedOpType::ELTWISE,
FusedOpType::ACTIVATION };
FusedOpType::ACTIVATION,
FusedOpType::REORDER };
}

protected:
@@ -15,6 +15,10 @@ class SelectKernelRef : public SelectKernelBase {
KernelsData GetKernelsData(const Params& params) const override;
KernelsPriority GetKernelsPriority(const Params& params) const override;
ParamsKey GetSupportedKey() const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::REORDER };
}


protected:
bool Validate(const Params& p) const override;
48 changes: 35 additions & 13 deletions src/plugins/intel_gpu/tests/unit/fusions/eltwise_fusion_test.cpp
@@ -30,7 +30,7 @@ struct eltwise_test_params {

class EltwiseFusingTest : public ::BaseFusingTest<eltwise_test_params> {
public:
void execute(eltwise_test_params& p) {
void execute(eltwise_test_params& p, bool count_reorder = false) {
auto input_prim = get_mem(get_input_layout(p));
auto input_prim2 = get_mem(get_input_layout2(p));

@@ -45,7 +45,7 @@ class EltwiseFusingTest : public ::BaseFusingTest<eltwise_test_params> {
network_not_fused.set_input_data("input2", input_prim2);
}

compare(network_not_fused, network_fused, p);
compare(network_not_fused, network_fused, p, count_reorder);
}

layout get_input_layout(eltwise_test_params& p) {
@@ -545,21 +545,27 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_no_pitches_same_dims_quantize, ::t
eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 4 },
}));

class eltwise_activation : public EltwiseFusingTest {};
TEST_P(eltwise_activation, basic) {
class eltwise_activation_reorder : public EltwiseFusingTest {};
TEST_P(eltwise_activation_reorder, basic) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
input_layout("input2", get_input_layout2(p)),
eltwise("eltwise", { input_info("input"), input_info("input2") }, p.mode, p.default_type),
activation("activation", input_info("eltwise"), activation_func::relu, { 6.0f, 0.0f }),
reorder("out", input_info("activation"), p.default_format, data_types::f32)
);
create_topologies(input_layout("input", get_input_layout(p)),
input_layout("input2", get_input_layout2(p)),
eltwise("eltwise", {input_info("input"), input_info("input2")}, p.mode, p.default_type),
activation("activation", input_info("eltwise"), activation_func::relu, {6.0f, 0.0f}),
reorder("out",
input_info("activation"),
p.default_format,
data_types::f32,
std::vector<float>(),
cldnn::reorder_mean_mode::subtract,
cldnn::padding(),
true));

tolerance = default_tolerance(p.input_type);
execute(p);
execute(p, true);
}

class eltwise_activation : public EltwiseFusingTest {};
TEST_P(eltwise_activation, fp16_out) {
auto p = GetParam();
create_topologies(
@@ -574,6 +580,21 @@ TEST_P(eltwise_activation, fp16_out) {
execute(p);
}

INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_activation_reorder, ::testing::ValuesIn(std::vector<eltwise_test_params>{
eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP16_3, 4, 5 },
eltwise_test_params{ CASE_ELTWISE_FP32_1, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP32_2, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP32_FP16_1, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP32_FP16_2, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP32_FP16_3, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP16_FP32_1, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP16_FP32_2, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP16_FP32_3, 4, 5 }
}));
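In these rows the two trailing integers are the expected primitive counts for the fused and unfused networks, and with `count_reorder = true` the trailing reorder is included in the tally — which is why the reorder variants expect one more primitive than the matching `eltwise_activation` rows below. A sketch of what `compare()` checks, with assumed member and helper names following the `BaseFusingTest` pattern:

```cpp
// Sketch (assumed names): both networks run, then executed-primitive counts
// are compared against the per-case expectations.
void compare(network& not_fused, network& fused, eltwise_test_params& p, bool count_reorder) {
    auto cnt = [&](network& net) {
        return count_executed_primitives(net, count_reorder);  // hypothetical helper
    };
    ASSERT_EQ(cnt(fused), p.expected_fused_primitives);          // assumed field
    ASSERT_EQ(cnt(not_fused), p.expected_not_fused_primitives);  // assumed field
    // an element-wise output comparison within `tolerance` follows
}
```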

INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_activation, ::testing::ValuesIn(std::vector<eltwise_test_params>{
eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 4 },
eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 4 },
@@ -590,6 +611,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_activation, ::testing::ValuesIn(st
}));



class eltwise_quantize_fs_b_yx_fsv32 : public EltwiseFusingTest {};
TEST_P(eltwise_quantize_fs_b_yx_fsv32, fusing_eltwise_quantize_layout) {
auto p = GetParam();
@@ -649,4 +671,4 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_quantize_fs_b_yx_fsv32_exception,
eltwise_test_params{ CASE_ELTWISE_FP16_B_FS_YX, 6, 6 },
eltwise_test_params{ CASE_ELTWISE_FP16_BATCH_FS_B, 6, 6 },
eltwise_test_params{ CASE_ELTWISE_FP16_BATCH_B_FS, 6, 6 },
}));
15 changes: 9 additions & 6 deletions src/plugins/intel_gpu/tests/unit/fusions/gather_fusion_test.cpp
@@ -32,7 +32,7 @@ struct gather_test_params {

class GatherPrimitiveFusingTest : public ::BaseFusingTest<gather_test_params> {
public:
void execute(gather_test_params& p, bool is_dynamic = false) {
void execute(gather_test_params& p, bool is_dynamic = false, bool count_reorder = false) {
cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
cfg_fused.set_property(ov::intel_gpu::optimize_data(true));
@@ -50,7 +50,7 @@ class GatherPrimitiveFusingTest : public ::BaseFusingTest<gather_test_params> {
network_not_fused.set_input_data("eltwise_data", elt_input_prim);
}

compare(network_not_fused, network_fused, p);
compare(network_not_fused, network_fused, p, count_reorder);
}

layout get_input_layout(gather_test_params& p, bool is_dynamic = false) {
@@ -119,6 +119,8 @@ class GatherPrimitiveFusingTest : public ::BaseFusingTest<gather_test_params> {
#define CASE_GATHER_5D_FP16_4 { 3, 2, 2, 2, 2 }, { 2, 3, 1, 1 }, { 3, 2, 2, 3, 2 }, 2, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx
#define CASE_GATHER_5D_FP16_5 { 1, 1, 2, 1, 1 }, { 3, 1, 1, 1 }, { 1, 1, 1, 1, 3 }, 4, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx

#define CASE_GATHER_INT8_1 { 2, 3, 4, 1 }, { 4 }, { 4, 3, 4, 1 }, 0, data_types::i8, format::bfyx, data_types::f32, format::bfyx

class gather_quantize : public GatherPrimitiveFusingTest {};
TEST_P(gather_quantize, basic) {
auto p = GetParam();
@@ -223,14 +225,15 @@ TEST_P(gather_eltwise_activation_dynamic, basic) {
gather("gather_prim", input_info("input"), input_info("gather_indices"), p.axis, p.dictionary_shape.size(), p.out_shape),
activation("activation", input_info("gather_prim"), activation_func::abs),
eltwise("eltwise", { input_info("activation"), input_info("eltwise_data") }, eltwise_mode::prod),
reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32)
reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32, std::vector<float>(), cldnn::reorder_mean_mode::subtract, cldnn::padding(), true)
);

tolerance = 1e-5f;
execute(p, true);
execute(p, true, true);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_eltwise_activation_dynamic, ::testing::ValuesIn(std::vector<gather_test_params>{
gather_test_params{ CASE_GATHER_FP32_6, 4, 6 },
gather_test_params{ CASE_GATHER_FP16_6, 4, 6 },
gather_test_params{ CASE_GATHER_FP16_7, 4, 6 },
gather_test_params{ CASE_GATHER_FP16_6, 4, 7 },
gather_test_params{ CASE_GATHER_FP16_7, 5, 8 },
gather_test_params{ CASE_GATHER_INT8_1, 4, 7 },
}));
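The raised expectations reflect the extra reorder in the count, and the new `CASE_GATHER_INT8_1` exercises the dtype-converting path directly: reading the macro against its neighbors, it appears to define an i8 `bfyx` input with an f32 default type, so the trailing reorder is a pure i8-to-f32 conversion.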