diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp index e21d857171238b..862765a84326fa 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp @@ -75,6 +75,23 @@ void add_required_reorders::run(program& p) { } } + if (usr->is_type() && usr->is_in_shape_of_subgraph()) { + for (size_t i = 0; i < usr->get_dependencies().size(); i++) { + auto& dep = usr->get_dependency(i); + if (!dep.is_in_data_flow() || dep.is_constant()) + continue; + auto dep_layout = dep.get_output_layout(); + auto out_layout = usr->get_output_layout(); + bool required_reorder = out_layout.data_type != dep_layout.data_type; + if (required_reorder) { + auto new_reorder = std::make_shared(dep.id() + "_reorder_" + usr->id(), dep.id(), out_layout.format, out_layout.data_type); + auto& new_reorder_node = p.get_or_create(new_reorder); + p.add_intermediate(new_reorder_node, *usr, dep); + new_reorder_node.recalc_output_layout(false); + } + } + } + if (optimize_data) { auto fused_ops = usr->get_fused_primitives(); auto out_layout = usr->get_output_layout(); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index 09654d2265ff91..9dfa04e708a7d7 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -67,7 +67,7 @@ bool concat_in_place_optimization::match(const program_node& concat_node, kernel_impl_params concat_params, std::vector pred_params, bool is_runtime) { - if (concat_node.is_output() || concat_params.fused_desc.size() > 0) + if (concat_node.is_output() || concat_params.fused_desc.size() > 0 || concat_node.is_in_shape_of_subgraph()) return false; auto pred_nodes = concat_node.get_dependencies(); for (auto p : pred_nodes) { diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 6ca6a654d6ab18..72fffba694c2b7 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -826,11 +826,11 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { if (input_data.in_shape_of_subgraph || node->in_shape_of_subgraph) return; - auto& input_lo = quantize_node.get_dependency(1); - auto& input_hi = quantize_node.get_dependency(2); - auto out_layout = quantize_node.get_output_layout(); auto in_layout = input_data.get_output_layout(); + if (in_layout.is_dynamic() || out_layout.is_dynamic()) + return; + auto out_dt = out_layout.data_type; auto in_dt = input_data.get_input_layout(0).data_type; auto out_dt_is_i8_u8 = data_type_traits::is_i8_u8(out_dt); @@ -844,6 +844,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { quantize_node.get_per_tensor_output_shift() && quantize_node.get_per_tensor_output_range(); + auto& input_lo = quantize_node.get_dependency(1); + auto& input_hi = quantize_node.get_dependency(2); bool should_fuse = input_data.is_type() && ((out_dt == data_types::bin && quantize_node.get_dependencies().size() == 5 && diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index 8bd5dbb74d8209..92e0044f92337a 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -449,7 +449,8 @@ void remove_redundant_reorders::run(program& p) { itr = p.get_processing_order().begin(); while (itr != p.get_processing_order().end()) { auto& node_ptr = *itr++; - if (!node_ptr->is_type() || !node_ptr->is_in_data_flow() || node_ptr->get_users().size() != 1 || node_ptr->get_dependencies().size() != 1) + if (!node_ptr->is_type() || !node_ptr->is_in_data_flow() || node_ptr->get_users().size() != 1 || + node_ptr->get_dependencies().size() != 1 || node_ptr->is_dynamic()) continue; auto& node = node_ptr->as(); diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index 925b7e811ecabd..d31704f565816e 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -418,8 +418,8 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, reorder_node // Because mvn and concatenation kernel can work cross-layout, if reorder only performs type conversion, // fusing reorder to the previous node can be done even if it is a dynamic shape case if ((prev.is_type() || prev.is_type()) && - (format::is_simple_data_format(fmt_prev) && format::is_simple_data_format(fmt_next)) && - node.is_type_conversion_only()) + !prev.is_in_shape_of_subgraph() && node.is_type_conversion_only() && + (format::is_simple_data_format(fmt_prev) && format::is_simple_data_format(fmt_next))) return true; if (prev.is_dynamic() || (!node.get_users().empty() && node.get_users().front()->is_dynamic())) diff --git a/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp new file mode 100644 index 00000000000000..f83ba709fac55c --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp @@ -0,0 +1,65 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" + +#include "intel_gpu/runtime/engine.hpp" + +#include "intel_gpu/graph/network.hpp" +#include "intel_gpu/graph/program.hpp" +#include "data_inst.h" +#include "shape_of_inst.h" +#include "gather_inst.h" +#include "eltwise_inst.h" +#include "reshape_inst.h" +#include "concatenation_inst.h" +#include "pass_manager.h" +#include "to_string_utils.h" + +#include "program_wrapper.h" + +#include + +using namespace cldnn; +using namespace ::tests; + +TEST(add_required_reorders, input_reorder_inside_shape_of_subgraph) { + auto& engine = get_test_engine(); + auto input_layout_dynamic = layout{ov::PartialShape{1, 32, ov::Dimension::dynamic(), ov::Dimension::dynamic()}, + data_types::f16, format::bfyx}; + auto input = engine.allocate_memory({ov::PartialShape{1, 32, 32, 32}, data_types::f16, format::bfyx}); + auto data_0 = engine.allocate_memory({ ov::PartialShape{}, data_types::i32, format::bfyx }); + auto data_1 = engine.allocate_memory({ ov::PartialShape{}, data_types::f32, format::bfyx }); + + topology topology; + topology.add(input_layout("input", input_layout_dynamic)); + topology.add(data("data_0", data_0)); + topology.add(data("data_1", data_1)); + topology.add(shape_of("shape_of", input_info("input"), 4, data_types::i32)); + topology.add(gather("gather0", input_info("shape_of"), input_info("data_0"), 0, {}, 0, true)); + topology.add(eltwise("eltwise0", {input_info("gather0"), input_info("data_1")}, eltwise_mode::prod, data_types::f32)); + topology.add(reshape("reshape0", input_info("eltwise0"), false, {}, + ov::PartialShape{1}, reshape::reshape_mode::unsqueeze)); + topology.add(gather("gather1", input_info("shape_of"), input_info("data_0"), 0, {}, 0, true)); + topology.add(eltwise("eltwise1", {input_info("gather1"), input_info("data_1")}, eltwise_mode::prod, data_types::f32)); + topology.add(reshape("reshape1", input_info("eltwise1"), false, {}, + ov::PartialShape{1}, reshape::reshape_mode::unsqueeze)); + topology.add(concatenation("concat0", {input_info("reshape0"), input_info("reshape1")}, 0, data_types::f32)); + + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + network network(engine, topology, config); + network.set_input_data("input", input); + + network.execute(); + + auto prog = network.get_program(); + ASSERT_NE(prog, nullptr); + + auto& eltwise_node = prog->get_node("eltwise0"); + auto eltwise_in_layout = eltwise_node.get_input_layout(); + + ASSERT_EQ(eltwise_in_layout.data_type, data_types::f32); +} diff --git a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp index 9eaa26b303abdd..fc1a4775378637 100644 --- a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp @@ -13,6 +13,8 @@ #include "fully_connected_inst.h" #include "permute_inst.h" #include "reorder_inst.h" +#include "shape_of_inst.h" +#include "gather_inst.h" #include "intel_gpu/graph/network.hpp" #include "pass_manager.h" #include "to_string_utils.h" @@ -520,6 +522,61 @@ TEST(prepare_buffer_fusing, crop_b_axis) { } } +TEST(prepare_buffer_fusing, skip_in_place_concat_inside_shape_of_subgraph) { + auto& engine = get_test_engine(); + auto input_layout_dynamic = layout{ov::PartialShape{1, 32, ov::Dimension::dynamic(), ov::Dimension::dynamic()}, + data_types::f16, format::bfyx}; + auto input = engine.allocate_memory({ov::PartialShape{1, 32, 32, 32}, data_types::f16, format::bfyx}); + auto data_0 = engine.allocate_memory({ ov::PartialShape{}, data_types::i32, format::bfyx }); + auto data_1 = engine.allocate_memory({ ov::PartialShape{}, data_types::f32, format::bfyx }); + auto data_2 = engine.allocate_memory({ ov::PartialShape{4}, data_types::i32, format::bfyx }); + + const ov::op::AutoBroadcastSpec& broadcast_spec = ov::op::AutoBroadcastSpec(ov::op::AutoBroadcastType::NUMPY); + + topology topology; + topology.add(input_layout("input", input_layout_dynamic)); + topology.add(data("data_0", data_0)); + topology.add(data("data_1", data_1)); + topology.add(data("data_2", data_2)); + topology.add(shape_of("shape_of", input_info("input"), 4, data_types::i32)); + topology.add(gather("gather0", input_info("shape_of"), input_info("data_0"), 0, {}, 0, true)); + topology.add(reorder("reorder0", input_info("gather0"), format::any, data_types::f32, + std::vector(), reorder_mean_mode::subtract, padding(), true)); + topology.add(eltwise("eltwise0", input_info("reorder0"), input_info("data_1"), eltwise_mode::prod, broadcast_spec)); + topology.add(reshape("reshape0", input_info("eltwise0"), false, {}, + ov::PartialShape{1}, reshape::reshape_mode::unsqueeze)); + topology.add(gather("gather1", input_info("shape_of"), input_info("data_0"), 0, {}, 0, true)); + topology.add(reorder("reorder1", input_info("gather1"), format::any, data_types::f32, + std::vector(), reorder_mean_mode::subtract, padding(), true)); + topology.add(eltwise("eltwise1", input_info("reorder1"), input_info("data_1"), eltwise_mode::prod, broadcast_spec)); + topology.add(reshape("reshape1", input_info("eltwise1"), false, {}, + ov::PartialShape{1}, reshape::reshape_mode::unsqueeze)); + topology.add(crop("crop", input_info("shape_of"), tensor({2,1,1,1,1,1,1,1,1}), tensor({0,0,0,0,1,1,1,1,1}))); + topology.add(concatenation("concat0", {input_info("reshape0"), input_info("reshape1")}, 0, data_types::f32)); + topology.add(reorder("reorder3", input_info("concat0"), format::any, data_types::i32, + std::vector(), reorder_mean_mode::subtract, padding(), true)); + topology.add(concatenation("concat1", {input_info("reorder3"), input_info("crop")}, 0, data_types::i32)); + topology.add(eltwise("eltwise2", input_info("concat1"), input_info("data_2"), eltwise_mode::prod, broadcast_spec)); + + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + network network(engine, topology, config); + network.set_input_data("input", input); + + network.execute(); + + auto prog = network.get_program(); + ASSERT_NE(prog, nullptr); + + auto& crop_node = prog->get_node("crop"); + auto impl_param = crop_node.get_kernel_impl_params(); + auto crop_mem = network.get_output_memory("crop"); + ASSERT_EQ(impl_param->get_output_layout(), crop_mem->get_layout()); + auto in_place = engine.is_the_same_buffer(*network.get_output_memory("crop"), *network.get_output_memory("concat1")); + ASSERT_FALSE(in_place); +} + #ifdef ENABLE_ONEDNN_FOR_GPU TEST(prepare_buffer_fusing, in_place_onednn_concat_static) { auto& engine = get_test_engine(); diff --git a/src/plugins/intel_gpu/tests/unit/passes/remove_redundant_reorders_tests.cpp b/src/plugins/intel_gpu/tests/unit/passes/remove_redundant_reorders_tests.cpp index 95a0aa05f724be..28a1e670c698d6 100644 --- a/src/plugins/intel_gpu/tests/unit/passes/remove_redundant_reorders_tests.cpp +++ b/src/plugins/intel_gpu/tests/unit/passes/remove_redundant_reorders_tests.cpp @@ -19,6 +19,8 @@ #include "activation_inst.h" #include "mvn_inst.h" #include "concatenation_inst.h" +#include "shape_of_inst.h" +#include "gather_inst.h" #include "pass_manager.h" #include "to_string_utils.h" @@ -323,3 +325,55 @@ TEST(remove_redundant_reorders, fuse_reorder_to_prev_concat_dyn) { ASSERT_EQ(concat_layout.data_type, data_types::f32); } + +TEST(remove_redundant_reorders, not_to_fuse_concat_with_reorder_inside_shape_of_subgraph) { + auto& engine = get_test_engine(); + auto input_layout_dynamic = layout{ov::PartialShape{1, 32, ov::Dimension::dynamic(), ov::Dimension::dynamic()}, + data_types::f16, format::bfyx}; + auto input = engine.allocate_memory({ov::PartialShape{1, 32, 32, 32}, data_types::f16, format::bfyx}); + auto data_0 = engine.allocate_memory({ ov::PartialShape{}, data_types::i32, format::bfyx }); + auto data_1 = engine.allocate_memory({ ov::PartialShape{}, data_types::f32, format::bfyx }); + auto data_2 = engine.allocate_memory({ ov::PartialShape{2}, data_types::i32, format::bfyx }); + + const ov::op::AutoBroadcastSpec& broadcast_spec = ov::op::AutoBroadcastSpec(ov::op::AutoBroadcastType::NUMPY); + + topology topology; + topology.add(input_layout("input", input_layout_dynamic)); + topology.add(data("data_0", data_0)); + topology.add(data("data_1", data_1)); + topology.add(data("data_2", data_2)); + topology.add(shape_of("shape_of", input_info("input"), 4, data_types::i32)); + topology.add(gather("gather0", input_info("shape_of"), input_info("data_0"), 0, {}, 0, true)); + topology.add(reorder("reorder0", input_info("gather0"), format::any, data_types::f32, + std::vector(), reorder_mean_mode::subtract, padding(), true)); + topology.add(eltwise("eltwise0", input_info("reorder0"), input_info("data_1"), eltwise_mode::prod, broadcast_spec)); + topology.add(reshape("reshape0", input_info("eltwise0"), false, {}, + ov::PartialShape{1}, reshape::reshape_mode::unsqueeze)); + topology.add(gather("gather1", input_info("shape_of"), input_info("data_0"), 0, {}, 0, true)); + topology.add(reorder("reorder1", input_info("gather1"), format::any, data_types::f32, + std::vector(), reorder_mean_mode::subtract, padding(), true)); + topology.add(eltwise("eltwise1", input_info("reorder1"), input_info("data_1"), eltwise_mode::prod, broadcast_spec)); + topology.add(reshape("reshape1", input_info("eltwise1"), false, {}, + ov::PartialShape{1}, reshape::reshape_mode::unsqueeze)); + topology.add(concatenation("concat0", {input_info("reshape0"), input_info("reshape1")}, 0, data_types::f32)); + topology.add(reorder("reorder3", input_info("concat0"), format::any, data_types::i32, + std::vector(), reorder_mean_mode::subtract, padding(), true)); + topology.add(concatenation("concat1", {input_info("reorder3"), input_info("data_2")}, 0, data_types::i32)); + + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + network network(engine, topology, config); + network.set_input_data("input", input); + + network.execute(); + + auto prog = network.get_program(); + ASSERT_NE(prog, nullptr); + + ASSERT_TRUE(has_node(*prog, "reorder3")); + auto& concat_node = prog->get_node("concat0"); + auto concat_layout = concat_node.get_output_layout(); + + ASSERT_EQ(concat_layout.data_type, data_types::f32); +}