From be0c954fd212005006abffc15c50d267d9a04b94 Mon Sep 17 00:00:00 2001
From: Eddy Kim
Date: Wed, 13 Mar 2024 11:18:06 +0900
Subject: [PATCH] [GPU] Skip broadcast when input and output shapes are
 identical (#23331)

### Details:
 - This PR allows some `Broadcast` layers to be skipped at runtime when their input and output shapes are identical.

### Tickets:
 - 135100
---
 src/plugins/intel_gpu/src/graph/broadcast.cpp | 19 +++++++++
 .../mark_runtime_skippable_nodes.cpp          | 40 +++++++++++++++++++
 .../src/graph/impls/ocl/primitive_base.hpp    |  4 +-
 .../src/graph/include/broadcast_inst.h        |  4 ++
 .../src/graph/include/primitive_inst.h        |  1 +
 .../intel_gpu/src/graph/primitive_inst.cpp    | 29 +++++++++++++-
 6 files changed, 95 insertions(+), 2 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/broadcast.cpp b/src/plugins/intel_gpu/src/graph/broadcast.cpp
index 9890b8c321d3f9..0617fd8a8561c9 100644
--- a/src/plugins/intel_gpu/src/graph/broadcast.cpp
+++ b/src/plugins/intel_gpu/src/graph/broadcast.cpp
@@ -126,6 +126,25 @@ std::string broadcast_inst::to_string(broadcast_node const& node) {
     return primitive_description.str();
 }
 
+void broadcast_inst::on_execute() {
+    update_output_memory();
+}
+
+void broadcast_inst::update_output_memory() {
+    if (!can_be_optimized())
+        return;
+    if (static_cast<bool>(_outputs[0]) && _network.get_engine().is_the_same_buffer(output_memory(), input_memory()))
+        return;
+
+    if (_node != nullptr)
+        build_deps();
+
+    GPU_DEBUG_TRACE_DETAIL << id() << " : update_output_memory with mem of input " << get_node().get_dependency(0).id()
+                           << " : " << input_memory_ptr()->buffer_ptr() << std::endl;
+    _outputs[0] = input_memory_ptr();
+    _mem_allocated = false;
+}
+
 broadcast_inst::typed_primitive_inst(network& network, broadcast_node const& node) : parent(network, node) {
     auto input_layout = node.get_input_layout();
     if (input_layout.is_dynamic())
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp
index d50affe4056452..da847d5d2504bc 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp
@@ -8,6 +8,7 @@
 #include "strided_slice_inst.h"
 #include "kv_cache_inst.h"
 #include "gemm_inst.h"
+#include "broadcast_inst.h"
 #include "program_helpers.h"
 
 using namespace cldnn;
@@ -95,5 +96,44 @@ void mark_runtime_skippable_nodes::run(program& p) {
             node.can_be_optimized(true);
             GPU_DEBUG_TRACE_DETAIL << "[mark_runtime_skippable_nodes] : " << node.id() << " can_be_optimized" << std::endl;
         });
+        program_helpers::do_for_types<broadcast>(*node, [](broadcast_node& node){
+            auto impl_params = node.get_kernel_impl_params();
+            if (node.is_output()
+                || node.has_fused_primitives()
+                || (impl_params->get_input_layout(0).format != impl_params->get_output_layout().format)
+                || (impl_params->get_input_layout(0).data_type != impl_params->get_output_layout().data_type))
+                return;
+
+            if (node.is_dynamic()) {
+                // If the only user is a reorder, it may be fused into this broadcast by the remove_redundant_reorders pass.
+                // In that case the broadcast cannot be optimized out, because its input and output shapes differ.
+                if (node.have_user_with_type<reorder>() && node.get_users().size() == 1)
+                    return;
+
+                // Bail out if the ranks differ, or if any pair of static dimensions differs in size.
+                auto input_pshape = impl_params->get_input_layout(0).get_partial_shape();
+                auto output_pshape = impl_params->get_output_layout().get_partial_shape();
+
+                if (input_pshape.rank().is_static() && output_pshape.rank().is_static()) {
+                    if (input_pshape.size() != output_pshape.size())
+                        return;
+
+                    auto input_pdim = input_pshape.begin();
+                    auto output_pdim = output_pshape.begin();
+                    while (input_pdim != input_pshape.end()) {
+                        if (input_pdim->is_static() && output_pdim->is_static()) {
+                            if (input_pdim->get_max_length() != output_pdim->get_max_length())
+                                return;
+                        }
+
+                        input_pdim++;
+                        output_pdim++;
+                    }
+                }
+
+                node.can_be_optimized(true);
+                GPU_DEBUG_TRACE_DETAIL << "[mark_runtime_skippable_nodes] : " << node.id() << " can_be_optimized" << std::endl;
+            }
+        });
     }
 }
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
index d1ae3e7e48b4c1..95f3be018911f4 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
@@ -21,6 +21,7 @@
 #include "gather_inst.h"
 #include "permute_inst.h"
 #include "strided_slice_inst.h"
+#include "broadcast_inst.h"
 
 #include <vector>
 #include <list>
@@ -88,7 +89,8 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
             !((impl_param.is_type<concatenation>() ||
                impl_param.is_type<gather>() ||
                impl_param.is_type<permute>() ||
-               impl_param.is_type<strided_slice>()) && impl_param.is_dynamic())) {
+               impl_param.is_type<strided_slice>() ||
+               impl_param.is_type<broadcast>()) && impl_param.is_dynamic())) {
             return make_unique<ImplType>(kernel_selector::kernel_data{});
         }
         auto kernel_params = ImplType::get_kernel_params(ImplType::static_canonicalize_shapes(impl_param));
diff --git a/src/plugins/intel_gpu/src/graph/include/broadcast_inst.h b/src/plugins/intel_gpu/src/graph/include/broadcast_inst.h
index 9b339aadd5c221..4d6e44720e05da 100644
--- a/src/plugins/intel_gpu/src/graph/include/broadcast_inst.h
+++ b/src/plugins/intel_gpu/src/graph/include/broadcast_inst.h
@@ -39,6 +39,10 @@ class typed_primitive_inst<broadcast> : public typed_primitive_inst_base<broadc
 
     typed_primitive_inst(network& network, broadcast_node const& node);
 
+    void update_output_memory() override;
+
+private:
+    void on_execute() override;
 };
 
 using broadcast_inst = typed_primitive_inst<broadcast>;
diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
index 6426aa16463c3b..d3e07948bdbc7b 100644
--- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
+++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -235,6 +235,7 @@ class primitive_inst {
     void do_runtime_skip_gather();
     void do_runtime_skip_permute();
     void do_runtime_skip_strided_slice();
+    void do_runtime_skip_broadcast();
     void do_runtime_in_place_concat();
     void do_runtime_in_place_kv_cache();
     void configure_shape_of_dependencies();
diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
index 18e5e48b416807..a2514e62eb7d28 100644
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -29,6 +29,7 @@
 #include "kv_cache_inst.h"
 #include "condition_inst.h"
 #include "gather_inst.h"
+#include "broadcast_inst.h"
 #include "experimental_detectron_roi_feature_extractor_inst.hpp"
 #include "implementation_map.hpp"
 #include "graph_optimizer/prepare_buffer_fusing.h"
@@ -538,7 +539,8 @@ event::ptr primitive_inst::realloc_if_needed() {
     }
 
     // Clear out memory if if was previously reused, but now primitive can't be optimized
-    if (_node->is_type<gather>() || _node->is_type<permute>() || _node->is_type<reshape>() || _node->is_type<reorder>() || _node->is_type<strided_slice>()) {
+    if (_node->is_type<gather>() || _node->is_type<permute>() || _node->is_type<reshape>() || _node->is_type<reorder>() ||
+        _node->is_type<strided_slice>() || _node->is_type<broadcast>()) {
         if (can_be_optimized()) {
             _max_output_layout_count = _deps[0].first->_max_output_layout_count;
             GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO("can_be_optimized");
@@ -1156,6 +1158,30 @@ void primitive_inst::do_runtime_skip_strided_slice() {
     set_can_be_optimized(true);
 }
 
+void primitive_inst::do_runtime_skip_broadcast() {
+    OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("do_runtime_skip_broadcast: " + id()));
+    // Check pattern
+    if (!get_node().is_type<broadcast>() || !get_node().can_be_optimized())
+        return;
+
+    GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_broadcast] " << id() << " : check optimizability" << std::endl;
+    auto input_layout = _impl_params->get_input_layout(0);
+    auto output_layout = _impl_params->get_output_layout();
+
+    // Check the runtime shapes (can_be_optimized may need to be reset)
+    if (input_layout != output_layout) {
+        set_can_be_optimized(false);
+        GPU_DEBUG_TRACE_DETAIL << "--- Cannot optimize because input layout(" << input_layout.to_short_string()
+                               << ") != output layout(" << output_layout.to_short_string() << ")" << std::endl;
+        return;
+    }
+
+    GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_broadcast] " << id() << " : can_be_optimized" << std::endl;
+    GPU_DEBUG_TRACE_DETAIL << " - Input layout : " << _impl_params->get_input_layout(0).to_short_string() << std::endl;
+    GPU_DEBUG_TRACE_DETAIL << " - Output layout : " << _impl_params->get_output_layout().to_short_string() << std::endl;
+    set_can_be_optimized(true);
+}
+
 void primitive_inst::do_runtime_in_place_concat() {
     OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("do_runtime_in_place_concat: " + id()));
     GPU_DEBUG_GET_INSTANCE(debug_config);
@@ -1280,6 +1306,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
         do_runtime_in_place_kv_cache();
         do_runtime_skip_permute();
         do_runtime_skip_strided_slice();
+        do_runtime_skip_broadcast();
 
         if (!is_valid_fusion()) {
             OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("unfused_subgraph_exec: " + id()));
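
---

For reviewers who want the optimization logic at a glance: the graph pass marks a dynamic `Broadcast` as *potentially* skippable at build time (same format, data type, and rank, and no pair of static dimensions that differ), and `do_runtime_skip_broadcast()` then confirms exact layout equality once concrete shapes are known, at which point `update_output_memory()` aliases the input buffer as the output. The sketch below restates those two predicates as standalone C++. It is a minimal illustration only: `Layout`, `may_skip_broadcast`, `do_skip_broadcast`, and the `-1`-marks-a-dynamic-dimension convention are hypothetical stand-ins, not cldnn's actual `layout`/`partial_shape` API.

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical stand-in for cldnn::layout: shape plus format/data-type tags.
struct Layout {
    std::vector<int64_t> shape;  // -1 marks a dynamic dimension (sketch convention)
    int format = 0;              // placeholder for cldnn::format
    int data_type = 0;           // placeholder for cldnn::data_types

    bool operator==(const Layout& other) const {
        return shape == other.shape && format == other.format && data_type == other.data_type;
    }
};

// Build-time check (mirrors mark_runtime_skippable_nodes): the broadcast *may*
// be an identity if formats/data types match, ranks match, and no dimension
// that is static on both sides differs.
bool may_skip_broadcast(const Layout& in, const Layout& out) {
    if (in.format != out.format || in.data_type != out.data_type)
        return false;
    if (in.shape.size() != out.shape.size())
        return false;
    for (std::size_t i = 0; i < in.shape.size(); ++i) {
        const bool both_static = in.shape[i] != -1 && out.shape[i] != -1;
        if (both_static && in.shape[i] != out.shape[i])
            return false;  // a static dim differs -> a real broadcast happens
    }
    return true;
}

// Runtime check (mirrors do_runtime_skip_broadcast): once shapes are concrete,
// skip only on exact layout equality; the instance then reuses the input
// buffer as its output (zero-copy).
bool do_skip_broadcast(const Layout& in, const Layout& out) {
    return in == out;
}

int main() {
    Layout in{{1, -1, 512}, 0, 0};
    Layout marked_out{{1, -1, 512}, 0, 0};  // same rank, static dims match
    std::cout << "may skip: " << may_skip_broadcast(in, marked_out) << "\n";  // 1

    Layout runtime_in{{1, 8, 512}, 0, 0};
    Layout runtime_out{{1, 8, 512}, 0, 0};
    std::cout << "do skip:  " << do_skip_broadcast(runtime_in, runtime_out) << "\n";  // 1
}
```

The two-phase split is the core design point of the patch: with dynamic shapes, build time can only prove a broadcast *might* be an identity, so the final decision (and the buffer aliasing) has to wait until actual shapes arrive at execution time, and `do_runtime_skip_broadcast()` resets `can_be_optimized` whenever the runtime layouts turn out to differ.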