From 1a26e6fe875069aee182e60dc1a8e2940085a8d7 Mon Sep 17 00:00:00 2001 From: Taylor Yeonbok Lee Date: Wed, 3 Jul 2024 02:29:38 +0000 Subject: [PATCH] [GPU] Fuse more 'type conversion only' reorders (#25270) ### Details: - Reorder which is only converting type can be fused to the prior node ### Tickets: - 144957 --- .../remove_redundant_reorders.cpp | 17 ++- .../intel_gpu/src/graph/layout_optimizer.cpp | 14 ++- src/plugins/intel_gpu/src/graph/select.cpp | 3 + .../cl_kernels/broadcast_gpu_ref.cl | 12 +- .../cl_kernels/generic_eltwise_ref.cl | 2 +- .../cl_kernels/select_gpu_ref.cl | 2 +- .../broadcast/broadcast_kernel_ref.cpp | 1 + .../kernels/broadcast/broadcast_kernel_ref.h | 3 + .../kernels/eltwise/eltwise_kernel_ref.h | 3 +- .../kernels/gather/gather_kernel_ref.h | 3 +- .../kernels/select/select_kernel_ref.h | 4 + .../unit/fusions/eltwise_fusion_test.cpp | 48 ++++++-- .../tests/unit/fusions/gather_fusion_test.cpp | 15 ++- .../tests/unit/fusions/select_fusion_test.cpp | 112 ++++++++++++++++++ .../unit/test_cases/broadcast_gpu_test.cpp | 72 +++++++---- 15 files changed, 252 insertions(+), 59 deletions(-) create mode 100644 src/plugins/intel_gpu/tests/unit/fusions/select_fusion_test.cpp diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index 3c669d2d7beb9e..534b6ec857943d 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -11,6 +11,10 @@ #include "convert_color_inst.h" #include "one_hot_inst.h" #include "shape_of_inst.h" +#include "gather_inst.h" +#include "select_inst.h" +#include "eltwise_inst.h" +#include "broadcast_inst.h" #include "permute_inst.h" #include "depth_to_space_inst.h" #include "concatenation_inst.h" @@ -410,8 +414,11 @@ void remove_redundant_reorders::run(program& p) { continue; bool same_data_type = input.get_output_layout().data_type == output_layout.data_type; - bool allowed_dt_conversion_fuse = (input.is_type() || input.is_type() || input.is_type() || input.is_type() || - input.is_type() || input.is_type() || input.is_type()); + bool allowed_dt_conversion_fuse = + (input.is_type() || input.is_type() || input.is_type() || + input.is_type() || input.is_type() || input.is_type() || + input.is_type() || input.is_type() || input.is_type() || + input.is_type() || input.is_type()) { fused_primitive_desc local_desc(node.get_primitive()); local_desc.f_param = node.get_fuse_params(); local_desc.total_num_deps = node.get_dependencies().size(); diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index 1de1683444196f..8c5930e4595065 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -13,6 +13,7 @@ #include "reshape_inst.h" #include "arg_max_min_inst.h" #include "shape_of_inst.h" +#include "select_inst.h" #include "condition_inst.h" #include "strided_slice_inst.h" #include @@ -33,6 +34,7 @@ #include "prior_box_inst.h" #include "scatter_nd_update_inst.h" #include "gather_inst.h" +#include "broadcast_inst.h" #include "loop_inst.h" #include "dft_inst.h" #include "to_string_utils.h" @@ -428,10 +430,18 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, reorder_node bool allow_new_shape_infer = node.get_program().is_new_shape_infer(); // Because mvn and concatenation kernel can work cross-layout, if reorder only performs type conversion, // fusing reorder to the previous node can be done even if it is a dynamic shape case - if ((prev.is_type() || prev.is_type()) && + if ((prev.is_type() || prev.is_type() || prev.is_type() || prev.is_type() || + prev.is_type(); auto dt = desc->output_data_types[0].value_or(input1_layout.data_type); + if (impl_param.has_fused_primitives()) { + dt = impl_param.get_output_element_type(); + } ov::op::v1::Select op; op.set_auto_broadcast(desc->broadcast_spec); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/broadcast_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/broadcast_gpu_ref.cl index 3f2314e1a38ed4..3195ffe2f3e023 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/broadcast_gpu_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/broadcast_gpu_ref.cl @@ -253,6 +253,7 @@ inline uint FUNC(get_idx_pos)(OPTIONAL_SHAPE_INFO_ARG uint out_b, uint out_f, ui #define VLOAD CAT(vload, VEC_SIZE) #define VSTORE CAT(vstore,VEC_SIZE) #define INPUT0_VTYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE) +#define OUTPUT_VTYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE) KERNEL(broadcast_gpu_ref)( OPTIONAL_SHAPE_INFO_ARG @@ -322,7 +323,7 @@ KERNEL(broadcast_gpu_ref)( uint output_idx = out_pos; unroll_for(uint j = 0; j < y_nums; j++) { unroll_for(uint i = 0; i < x_stride; i++) { - output[output_idx + i] = input[idx_pos + i]; + output[output_idx + i] = TO_OUTPUT_TYPE(input[idx_pos + i]); } output_idx += OUTPUT_SIZE_X; } @@ -330,7 +331,10 @@ KERNEL(broadcast_gpu_ref)( uint output_idx = out_pos; INPUT0_VTYPE input_vec = VLOAD(0, &input[idx_pos]); unroll_for(uint i = 0; i < y_nums; i++) { - VSTORE(input_vec, 0, &output[output_idx]); + OUTPUT_VTYPE out_v; + for (int j = 0; j < VEC_SIZE; ++j) + out_v[j] = TO_OUTPUT_TYPE(input_vec[j]); + VSTORE(out_v, 0, &output[output_idx]); output_idx += OUTPUT_SIZE_X; } @@ -339,7 +343,7 @@ KERNEL(broadcast_gpu_ref)( output_idx = out_pos; unroll_for(uint i = 0; i < y_nums; i++) { - output[output_idx + x_stride] = input_val; + output[output_idx + x_stride] = TO_OUTPUT_TYPE(input_val); output_idx += OUTPUT_SIZE_X; } } @@ -375,7 +379,7 @@ KERNEL(broadcast_gpu_ref)( const uint out_pos = OUTPUT_GET_INDEX(out_b, out_f, out_y, out_x); const uint idx_pos = FUNC_CALL(get_idx_pos)(OPTIONAL_SHAPE_INFO_TENSOR out_b, out_f, out_y, out_x); #endif - output[out_pos] = input[idx_pos]; + output[out_pos] = TO_OUTPUT_TYPE(input[idx_pos]); } } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/generic_eltwise_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/generic_eltwise_ref.cl index 17b25de5e6fbc6..ae6a02d7770d2e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/generic_eltwise_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/generic_eltwise_ref.cl @@ -192,6 +192,6 @@ KERNEL(eltwise)( #if QUANTIZATION_TERM && !OUTPUT_IS_FP output[output_offset] = TO_OUTPUT_TYPE_SAT(ACTIVATION(out, ACTIVATION_PARAMS)); #else - output[output_offset] = ACTIVATION_TYPED(out, ACTIVATION_PARAMS_TYPED); + output[output_offset] = TO_OUTPUT_TYPE(ACTIVATION_TYPED(out, ACTIVATION_PARAMS_TYPED)); #endif } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/select_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/select_gpu_ref.cl index d56b5299ac287a..e5a269fedcd4bc 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/select_gpu_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/select_gpu_ref.cl @@ -45,7 +45,7 @@ KERNEL(select)( uint output_offset = OUTPUT_GET_INDEX(b, f, y, x); #endif - const OUTPUT_TYPE res = select(INPUT_2, INPUT_1, MASK); + const OUTPUT_TYPE res = TO_OUTPUT_TYPE(select(INPUT_2, INPUT_1, MASK)); output[output_offset] = res; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/broadcast/broadcast_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/broadcast/broadcast_kernel_ref.cpp index 4830c366bc9fbf..82182e37f73898 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/broadcast/broadcast_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/broadcast/broadcast_kernel_ref.cpp @@ -25,6 +25,7 @@ ParamsKey BroadcastKernelRef::GetSupportedKey() const { k.EnableAllInputLayout(); k.EnableAllOutputLayout(); + k.EnableDifferentTypes(); k.EnableTensorOffset(); k.EnableTensorPitches(); k.EnableBatching(); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/broadcast/broadcast_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/broadcast/broadcast_kernel_ref.h index 50a61f4fcbccaf..8840e1ee0c03e2 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/broadcast/broadcast_kernel_ref.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/broadcast/broadcast_kernel_ref.h @@ -14,5 +14,8 @@ class BroadcastKernelRef : public BroadcastKernelBase { KernelsData GetKernelsData(const Params& params) const override; KernelsPriority GetKernelsPriority(const Params& params) const override; ParamsKey GetSupportedKey() const override; + std::vector GetSupportedFusedOps() const override { + return { FusedOpType::REORDER }; + } }; } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_ref.h index 25a1bca9fea598..5b9ef765be298a 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_ref.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_ref.h @@ -19,7 +19,8 @@ class EltwiseKernelRef : public EltwiseKernelBase { return { FusedOpType::QUANTIZE, FusedOpType::ACTIVATION, - FusedOpType::ELTWISE + FusedOpType::ELTWISE, + FusedOpType::REORDER }; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gather/gather_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/gather/gather_kernel_ref.h index 5893458dd6cf50..476f60eb8bef7d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gather/gather_kernel_ref.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gather/gather_kernel_ref.h @@ -37,7 +37,8 @@ class GatherKernelRef : public KernelBaseOpenCL { std::vector GetSupportedFusedOps() const override { return { FusedOpType::QUANTIZE, FusedOpType::ELTWISE, - FusedOpType::ACTIVATION }; + FusedOpType::ACTIVATION, + FusedOpType::REORDER }; } protected: diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/select/select_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/select/select_kernel_ref.h index fbc1a0193da13c..76b357cde6decf 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/select/select_kernel_ref.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/select/select_kernel_ref.h @@ -15,6 +15,10 @@ class SelectKernelRef : public SelectKernelBase { KernelsData GetKernelsData(const Params& params) const override; KernelsPriority GetKernelsPriority(const Params& params) const override; ParamsKey GetSupportedKey() const override; + std::vector GetSupportedFusedOps() const override { + return { FusedOpType::REORDER }; + } + protected: bool Validate(const Params& p) const override; diff --git a/src/plugins/intel_gpu/tests/unit/fusions/eltwise_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/eltwise_fusion_test.cpp index 5218ffe7920538..883279ed369dd9 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/eltwise_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/fusions/eltwise_fusion_test.cpp @@ -30,7 +30,7 @@ struct eltwise_test_params { class EltwiseFusingTest : public ::BaseFusingTest { public: - void execute(eltwise_test_params& p) { + void execute(eltwise_test_params& p, bool count_reorder = false) { auto input_prim = get_mem(get_input_layout(p)); auto input_prim2 = get_mem(get_input_layout2(p)); @@ -45,7 +45,7 @@ class EltwiseFusingTest : public ::BaseFusingTest { network_not_fused.set_input_data("input2", input_prim2); } - compare(network_not_fused, network_fused, p); + compare(network_not_fused, network_fused, p, count_reorder); } layout get_input_layout(eltwise_test_params& p) { @@ -545,21 +545,27 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_no_pitches_same_dims_quantize, ::t eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 4 }, })); -class eltwise_activation : public EltwiseFusingTest {}; -TEST_P(eltwise_activation, basic) { +class eltwise_activation_reorder : public EltwiseFusingTest {}; +TEST_P(eltwise_activation_reorder, basic) { auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - input_layout("input2", get_input_layout2(p)), - eltwise("eltwise", { input_info("input"), input_info("input2") }, p.mode, p.default_type), - activation("activation", input_info("eltwise"), activation_func::relu, { 6.0f, 0.0f }), - reorder("out", input_info("activation"), p.default_format, data_types::f32) - ); + create_topologies(input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + eltwise("eltwise", {input_info("input"), input_info("input2")}, p.mode, p.default_type), + activation("activation", input_info("eltwise"), activation_func::relu, {6.0f, 0.0f}), + reorder("out", + input_info("activation"), + p.default_format, + data_types::f32, + std::vector(), + cldnn::reorder_mean_mode::subtract, + cldnn::padding(), + true)); tolerance = default_tolerance(p.input_type); - execute(p); + execute(p, true); } +class eltwise_activation : public EltwiseFusingTest {}; TEST_P(eltwise_activation, fp16_out) { auto p = GetParam(); create_topologies( @@ -574,6 +580,21 @@ TEST_P(eltwise_activation, fp16_out) { execute(p); } +INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_activation_reorder, ::testing::ValuesIn(std::vector{ + eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP16_3, 4, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP32_1, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP32_2, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP32_FP16_1, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP32_FP16_2, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP32_FP16_3, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP16_FP32_1, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP16_FP32_2, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP16_FP32_3, 4, 5 } +})); + INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_activation, ::testing::ValuesIn(std::vector{ eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 4 }, eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 4 }, @@ -590,6 +611,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_activation, ::testing::ValuesIn(st })); + class eltwise_quantize_fs_b_yx_fsv32 : public EltwiseFusingTest {}; TEST_P(eltwise_quantize_fs_b_yx_fsv32, fusing_eltwise_quantize_layout) { auto p = GetParam(); @@ -649,4 +671,4 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_quantize_fs_b_yx_fsv32_exception, eltwise_test_params{ CASE_ELTWISE_FP16_B_FS_YX, 6, 6 }, eltwise_test_params{ CASE_ELTWISE_FP16_BATCH_FS_B, 6, 6 }, eltwise_test_params{ CASE_ELTWISE_FP16_BATCH_B_FS, 6, 6 }, -})); \ No newline at end of file +})); diff --git a/src/plugins/intel_gpu/tests/unit/fusions/gather_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/gather_fusion_test.cpp index 22e8df735c7c30..2468a96e789a37 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/gather_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/fusions/gather_fusion_test.cpp @@ -32,7 +32,7 @@ struct gather_test_params { class GatherPrimitiveFusingTest : public ::BaseFusingTest { public: - void execute(gather_test_params& p, bool is_dynamic = false) { + void execute(gather_test_params& p, bool is_dynamic = false, bool count_reorder = false) { cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic)); cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic)); cfg_fused.set_property(ov::intel_gpu::optimize_data(true)); @@ -50,7 +50,7 @@ class GatherPrimitiveFusingTest : public ::BaseFusingTest { network_not_fused.set_input_data("eltwise_data", elt_input_prim); } - compare(network_not_fused, network_fused, p); + compare(network_not_fused, network_fused, p, count_reorder); } layout get_input_layout(gather_test_params& p, bool is_dynamic = false) { @@ -119,6 +119,8 @@ class GatherPrimitiveFusingTest : public ::BaseFusingTest { #define CASE_GATHER_5D_FP16_4 { 3, 2, 2, 2, 2 }, { 2, 3, 1, 1 }, { 3, 2, 2, 3, 2 }, 2, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx #define CASE_GATHER_5D_FP16_5 { 1, 1, 2, 1, 1 }, { 3, 1, 1, 1 }, { 1, 1, 1, 1, 3 }, 4, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx +#define CASE_GATHER_INT8_1 { 2, 3, 4, 1 }, { 4 }, { 4, 3, 4, 1 }, 0, data_types::i8, format::bfyx, data_types::f32, format::bfyx + class gather_quantize : public GatherPrimitiveFusingTest {}; TEST_P(gather_quantize, basic) { auto p = GetParam(); @@ -223,14 +225,15 @@ TEST_P(gather_eltwise_activation_dynamic, basic) { gather("gather_prim", input_info("input"), input_info("gather_indices"), p.axis, p.dictionary_shape.size(), p.out_shape), activation("activation", input_info("gather_prim"), activation_func::abs), eltwise("eltwise", { input_info("activation"), input_info("eltwise_data") }, eltwise_mode::prod), - reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32) + reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32, std::vector(), cldnn::reorder_mean_mode::subtract, cldnn::padding(), true) ); tolerance = 1e-5f; - execute(p, true); + execute(p, true, true); } INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_eltwise_activation_dynamic, ::testing::ValuesIn(std::vector{ gather_test_params{ CASE_GATHER_FP32_6, 4, 6 }, - gather_test_params{ CASE_GATHER_FP16_6, 4, 6 }, - gather_test_params{ CASE_GATHER_FP16_7, 4, 6 }, + gather_test_params{ CASE_GATHER_FP16_6, 4, 7 }, + gather_test_params{ CASE_GATHER_FP16_7, 5, 8 }, + gather_test_params{ CASE_GATHER_INT8_1, 4, 7 }, })); diff --git a/src/plugins/intel_gpu/tests/unit/fusions/select_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/select_fusion_test.cpp new file mode 100644 index 00000000000000..0cc5843545ba44 --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/fusions/select_fusion_test.cpp @@ -0,0 +1,112 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include "select_inst.h" + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { +struct select_test_params { + ov::Shape input_shape; + ov::Shape mask_shape; + data_types input_type; + data_types output_type; + format input_format; + format output_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class SelectFusingTest : public ::BaseFusingTest { +public: + void execute(select_test_params& p, bool count_reorder = false) { + cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + cfg_fused.set_property(ov::intel_gpu::optimize_data(true)); + + tests::random_generator rg; + auto mask_mem = get_mem(get_mask_layout(p), 0, 1); + auto input1_mem = get_mem(get_input_layout(p)); + auto input2_mem = get_mem(get_input_layout(p)); + + network network_not_fused(this->engine, this->topology_non_fused, cfg_not_fused); + network network_fused(this->engine, this->topology_fused, cfg_fused); + + auto inputs = network_fused.get_input_ids(); + network_fused.set_input_data("mask", mask_mem); + network_fused.set_input_data("input1", input1_mem); + network_fused.set_input_data("input2", input2_mem); + network_not_fused.set_input_data("mask", mask_mem); + network_not_fused.set_input_data("input1", input1_mem); + network_not_fused.set_input_data("input2", input2_mem); + compare(network_not_fused, network_fused, p, count_reorder); + } + + layout get_input_layout(select_test_params& p) { + return layout{ p.input_shape, p.input_type, p.input_format }; + } + layout get_mask_layout(select_test_params& p) { + return layout{ p.mask_shape, data_types::i8, p.input_format }; + } +}; +} // namespace + +#define CASE_SELECT_FP32_TO_I8_0 {2, 16, 4, 4}, {2, 16, 4, 4}, data_types::f32, data_types::i8, format::bfyx, format::bfyx +#define CASE_SELECT_FP32_TO_F16_0 {2, 16, 17, 4}, {2, 16, 1, 4}, data_types::f32, data_types::f16, format::bfyx, format::bfyx +#define CASE_SELECT_FP16_TO_I8_0 {2, 16, 4, 4}, {2, 16, 4, 4}, data_types::f16, data_types::i8, format::bfyx, format::bfyx +#define CASE_SELECT_FP16_TO_I8_1 {2, 16, 4, 4}, {2, 16, 4, 4}, data_types::f16, data_types::i8, format::bfyx, format::bfzyx + +class select_reorder_fusion : public SelectFusingTest {}; +TEST_P(select_reorder_fusion, basic) { + auto p = GetParam(); + create_topologies( + cldnn::input_layout("mask", get_mask_layout(p)), + cldnn::input_layout("input1", get_input_layout(p)), + cldnn::input_layout("input2", get_input_layout(p)), + cldnn::reorder("mask_convert", input_info("mask"), p.input_format, p.input_type), + cldnn::select("select", input_info("input1"), input_info("input2"), input_info("mask_convert")), + cldnn::reorder("out", input_info("select"), p.output_format, p.output_type, std::vector(), cldnn::reorder_mean_mode::subtract, cldnn::padding(), true) + ); + tolerance = 1e-5f; + execute(p, true); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, select_reorder_fusion, ::testing::ValuesIn(std::vector{ + select_test_params{ CASE_SELECT_FP32_TO_F16_0, 5, 6}, + select_test_params{ CASE_SELECT_FP32_TO_I8_0, 5, 6}, + select_test_params{ CASE_SELECT_FP16_TO_I8_0, 5, 6}, + select_test_params{ CASE_SELECT_FP16_TO_I8_1, 6, 6}, // reorder should not be fused +})); + +class select_reorder_fusion_dynamic : public SelectFusingTest {}; +TEST_P(select_reorder_fusion_dynamic, basic) { + auto p = GetParam(); + create_topologies( + cldnn::input_layout("mask", layout{ ov::PartialShape::dynamic(p.mask_shape.size()), data_types::i8, p.input_format }), + cldnn::input_layout("input1", layout {ov::PartialShape::dynamic(p.input_shape.size()), p.input_type, p.input_format }), + cldnn::input_layout("input2", layout {ov::PartialShape::dynamic(p.input_shape.size()), p.input_type, p.input_format }), + cldnn::reorder("mask_convert", input_info("mask"), p.input_format, p.input_type), + cldnn::select("select", input_info("input1"), input_info("input2"), input_info("mask_convert")), + cldnn::reorder("out", input_info("select"), p.output_format, p.output_type, std::vector(), cldnn::reorder_mean_mode::subtract, cldnn::padding(), true) + ); + tolerance = 1e-5f; + execute(p, true); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, select_reorder_fusion_dynamic, ::testing::ValuesIn(std::vector{ + select_test_params{ CASE_SELECT_FP32_TO_F16_0, 5, 6}, + select_test_params{ CASE_SELECT_FP32_TO_I8_0, 5, 6}, + select_test_params{ CASE_SELECT_FP16_TO_I8_0, 5, 6}, + select_test_params{ CASE_SELECT_FP16_TO_I8_1, 6, 6}, // reorder should not be fused +})); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/broadcast_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/broadcast_gpu_test.cpp index d7c99f5ce60d58..184a5be5b1d81e 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/broadcast_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/broadcast_gpu_test.cpp @@ -86,32 +86,38 @@ void start_broadcast_test(format cldnn_format, data_types cldnn_data_type, std:: } } } -template +template void start_broadcast_test_dynamic(format input_format, data_types input_data_type, + data_types output_data_type, ov::Shape output_shape, ov::Shape input_data_shape, ov::AxisSet broadcast_axes, bool is_output_static = false, - impl_types impl_type = impl_types::any) { + impl_types impl_type = impl_types::any, + bool optimize = false) { size_t input_data_size = accumulate(input_data_shape.rbegin(), input_data_shape.rend(), (size_t)1, std::multiplies()); ASSERT_GE(input_data_size, (size_t)1); - std::vector input_data = {}; + std::vector input_data = {}; for (size_t i = 1; i <= input_data_size; ++i) { - input_data.push_back((T)i); + input_data.push_back((inT)i); } size_t output_data_size = accumulate(output_shape.rbegin(), output_shape.rend(), (size_t)1, std::multiplies()); ASSERT_GE(output_data_size, (size_t)1); - std::vector output_data(output_data_size); + std::vector output_data_tmp(output_data_size); + std::vector output_data; ov::reference::broadcast(reinterpret_cast(input_data.data()), - reinterpret_cast(output_data.data()), + reinterpret_cast(output_data_tmp.data()), ov::Shape(input_data_shape.begin(), input_data_shape.end()), ov::Shape(output_shape.begin(), output_shape.end()), ov::AxisSet(broadcast_axes), - sizeof(T)); + sizeof(inT)); - ASSERT_EQ(output_data.size(), accumulate(output_shape.rbegin(), output_shape.rend(), (size_t)1, std::multiplies())); + ASSERT_EQ(output_data_tmp.size(), accumulate(output_shape.rbegin(), output_shape.rend(), (size_t)1, std::multiplies())); + for (auto i : output_data_tmp) { + output_data.push_back((outT)i); + } int64_t input_rank = input_data_shape.size(); ASSERT_EQ(input_rank, broadcast_axes.size()); @@ -126,11 +132,15 @@ void start_broadcast_test_dynamic(format input_format, auto in_layout = layout(ov::PartialShape::dynamic(input_rank), input_data_type, fmt); topology.add(input_layout("input", in_layout)); topology.add(reorder("reorder", input_info("input"), input_format, input_data_type)); - topology.add(broadcast("broadcast", - input_info("reorder"), - output_shape, - ov::AxisSet(broadcast_axes))); - topology.add(reorder("output", input_info("broadcast"), fmt, input_data_type)); + topology.add(broadcast("broadcast", input_info("reorder"), output_shape, ov::AxisSet(broadcast_axes))); + topology.add(reorder("output", + input_info("broadcast"), + fmt, + output_data_type, + std::vector(), + cldnn::reorder_mean_mode::subtract, + cldnn::padding(), + true)); } else { auto in_layout = layout(ov::PartialShape::dynamic(input_rank), input_data_type, fmt); auto target_shape_layout = layout(ov::PartialShape{input_rank}, data_types::i32, fmt); @@ -140,7 +150,14 @@ void start_broadcast_test_dynamic(format input_format, topology.add(reorder("reorder", input_info("input"), input_format, input_data_type)); topology.add( broadcast("broadcast", input_info("reorder"), input_info("target_shape"), ov::AxisSet(broadcast_axes))); - topology.add(reorder("output", input_info("broadcast"), fmt, input_data_type)); + topology.add(reorder("output", + input_info("broadcast"), + fmt, + output_data_type, + std::vector(), + cldnn::reorder_mean_mode::subtract, + cldnn::padding(), + true)); std::vector target_shape_data; for (auto out_shape : output_shape) { target_shape_data.push_back(static_cast(out_shape)); @@ -150,6 +167,9 @@ void start_broadcast_test_dynamic(format input_format, ExecutionConfig config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + if (optimize) { + config.set_property(ov::intel_gpu::optimize_data(true)); + } const bool force_impl = impl_type != impl_types::any; if (force_impl) { @@ -168,7 +188,7 @@ void start_broadcast_test_dynamic(format input_format, // In case of impl forcing optimize_data property will set to true and additional // reorders optimization pass will be tiggered, so change expected primitive id - const auto prim_id = force_impl ? "output" : "broadcast"; + const auto prim_id = (force_impl || optimize) ? "output" : "broadcast"; auto inst = network.get_primitive(prim_id); auto impl = inst->get_impl(); ASSERT_TRUE(impl != nullptr); @@ -177,7 +197,7 @@ void start_broadcast_test_dynamic(format input_format, auto outputs = network.execute(); auto output = outputs.at("output").get_memory(); - cldnn::mem_lock output_ptr(output, get_test_stream()); + cldnn::mem_lock output_ptr(output, get_test_stream()); for (size_t i = 0; i < output_data_size; ++i) { ASSERT_EQ(output_ptr[i], output_data[i]); @@ -288,44 +308,44 @@ TEST(broadcast_gpu_int64_t, bfyx_1_to_4x5_w_b_axes_0x1) { // dynamic kernel TEST(broadcast_gpu_float, bfyx_1_to_4x5_w_b_axes_0x1_dynamic) { - start_broadcast_test_dynamic(format::bfyx, data_types::f32, {4, 5}, {1, 1}, {0, 1}); + start_broadcast_test_dynamic(format::bfyx, data_types::f32, data_types::f16, {4, 5}, {1, 1}, {0, 1}, false, impl_types::any, true); } TEST(broadcast_gpu_float, bfyx_1_to_4x5_w_b_axes_0x1_dynamic_with_static_output) { - start_broadcast_test_dynamic(format::bfyx, data_types::f32, {4, 5}, {1, 1}, {0, 1}, true); + start_broadcast_test_dynamic(format::bfyx, data_types::f32, data_types::f16, {4, 5}, {1, 1}, {0, 1}, true); } TEST(broadcast_gpu_uint8_t, bfyx_1_to_4x5_w_b_axes_0x1_dynamic) { - start_broadcast_test_dynamic(format::bfyx, data_types::u8, {4, 5}, {1, 1}, {0, 1}); + start_broadcast_test_dynamic(format::bfyx, data_types::u8, data_types::i32, {4, 5}, {1, 1}, {0, 1}); } TEST(broadcast_gpu_uint8_t, bfyx_1_to_4x5_w_b_axes_0x1x2_dynamic_with_static_output) { - start_broadcast_test_dynamic(format::bfyx, data_types::u8, {4, 5, 2}, {1, 1, 1}, {0, 1, 2}, true); + start_broadcast_test_dynamic(format::bfyx, data_types::u8, data_types::i32, {4, 5, 2}, {1, 1, 1}, {0, 1, 2}, true); } TEST(broadcast_gpu_int64_t, bfyx_1_to_4x5_w_b_axes_0x1_dynamic) { - start_broadcast_test_dynamic(format::bfyx, data_types::i64, {4, 5}, {1, 1}, {0, 1}); + start_broadcast_test_dynamic(format::bfyx, data_types::i64, data_types::i32, {4, 5}, {1, 1}, {0, 1}, false, impl_types::any, true); } TEST(broadcast_gpu_int64_t, bfyx_1_to_4x5_w_b_axes_0x1x2x3_dynamic_with_static_output) { - start_broadcast_test_dynamic(format::bfyx, data_types::i64, {4, 5, 2, 3}, {1, 1, 1, 1}, {0, 1, 2, 3}); + start_broadcast_test_dynamic(format::bfyx, data_types::i64, data_types::i32, {4, 5, 2, 3}, {1, 1, 1, 1}, {0, 1, 2, 3}); } // dynamic kernel cpu TEST(broadcast_cpu_impl_float, bfyx_1_to_4x5_w_b_axes_0x1_dynamic) { - start_broadcast_test_dynamic(format::bfyx, data_types::f32, {4, 5}, {1, 1}, {0, 1}, false, impl_types::cpu); + start_broadcast_test_dynamic(format::bfyx, data_types::f32, data_types::i32, {4, 5}, {1, 1}, {0, 1}, false, impl_types::cpu); } TEST(broadcast_cpu_impl_float, bfyx_1_to_4x5_w_b_axes_0x1_dynamic_with_static_output) { - start_broadcast_test_dynamic(format::bfyx, data_types::f32, {4, 5}, {1, 1}, {0, 1}, true, impl_types::cpu); + start_broadcast_test_dynamic(format::bfyx, data_types::f32, data_types::f32, {4, 5}, {1, 1}, {0, 1}, true, impl_types::cpu); } TEST(broadcast_cpu_impl_int64_t, bfyx_1_to_4x5_w_b_axes_0x1_dynamic) { - start_broadcast_test_dynamic(format::bfyx, data_types::i64, {4, 5}, {1, 1}, {0, 1}, false, impl_types::cpu); + start_broadcast_test_dynamic(format::bfyx, data_types::i64, data_types::i64, {4, 5}, {1, 1}, {0, 1}, false, impl_types::cpu); } TEST(broadcast_cpu_impl_int64_t, bfyx_1_to_4x5_w_b_axes_0x1x2x3_dynamic_with_static_output) { - start_broadcast_test_dynamic(format::bfyx, data_types::i64, {4, 5, 2, 3}, {1, 1, 1, 1}, {0, 1, 2, 3}, false, impl_types::cpu); + start_broadcast_test_dynamic(format::bfyx, data_types::i64, data_types::i32, {4, 5, 2, 3}, {1, 1, 1, 1}, {0, 1, 2, 3}, false, impl_types::cpu); } /* Expected golden_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,