[GPU] Skip broadcast when input and output shapes are identical (openvinotoolkit#23331)

### Details:
- This PR allows some `Broadcast` layers to be skipped at runtime when their input
and output shapes are identical (see the illustrative sketch below).
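
For context, the targeted pattern can be sketched with the OpenVINO 2.0 C++ API. This snippet is illustrative and not part of the commit; the function name `make_identity_broadcast_model` and the concrete shapes are invented for the example:

#include <openvino/openvino.hpp>

// Builds a model containing a Broadcast whose target shape equals the input
// shape; such a layer is effectively an identity, and with this change the
// GPU plugin can skip it at runtime.
std::shared_ptr<ov::Model> make_identity_broadcast_model() {
    auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{1, 3, 224, 224});
    // Target shape {1, 3, 224, 224} matches the input shape exactly.
    auto target = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {1, 3, 224, 224});
    auto bcast = std::make_shared<ov::op::v3::Broadcast>(input, target);
    return std::make_shared<ov::Model>(ov::OutputVector{bcast->output(0)}, ov::ParameterVector{input});
}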

### Tickets:
 - 135100
e-ddykim authored Mar 13, 2024
1 parent 1a769ce commit be0c954
Showing 6 changed files with 95 additions and 2 deletions.
19 changes: 19 additions & 0 deletions src/plugins/intel_gpu/src/graph/broadcast.cpp
@@ -126,6 +126,25 @@ std::string broadcast_inst::to_string(broadcast_node const& node) {
return primitive_description.str();
}

void broadcast_inst::on_execute() {
update_output_memory();
}

void broadcast_inst::update_output_memory() {
if (!can_be_optimized())
return;
if (static_cast<bool>(_outputs[0]) && _network.get_engine().is_the_same_buffer(output_memory(), input_memory()))
return;

if (_node != nullptr)
build_deps();

GPU_DEBUG_TRACE_DETAIL << id() << " : update_output_memory with mem of input " << get_node().get_dependency(0).id()
<< " : " << input_memory_ptr()->buffer_ptr() << std::endl;
_outputs[0] = input_memory_ptr();
_mem_allocated = false;
}

broadcast_inst::typed_primitive_inst(network& network, broadcast_node const& node) : parent(network, node) {
auto input_layout = node.get_input_layout();
if (input_layout.is_dynamic())
40 changes: 40 additions & 0 deletions src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp
@@ -8,6 +8,7 @@
#include "strided_slice_inst.h"
#include "kv_cache_inst.h"
#include "gemm_inst.h"
#include "broadcast_inst.h"
#include "program_helpers.h"

using namespace cldnn;
@@ -95,5 +96,44 @@ void mark_runtime_skippable_nodes::run(program& p) {
node.can_be_optimized(true);
GPU_DEBUG_TRACE_DETAIL << "[mark_runtime_skippable_nodes] : " << node.id() << " can_be_optimized" << std::endl;
});
program_helpers::do_for_types<broadcast>(*node, [](broadcast_node& node){
auto impl_params = node.get_kernel_impl_params();
if (node.is_output()
|| node.has_fused_primitives()
|| (impl_params->get_input_layout(0).format != impl_params->get_output_layout().format)
|| (impl_params->get_input_layout(0).data_type != impl_params->get_output_layout().data_type))
return;

if (node.is_dynamic()) {
// If the only user is a reorder, it may be fused into this broadcast by the remove_redundant_reorders pass.
// In that case, the broadcast cannot be optimized because its input and output shapes differ.
if (node.have_user_with_type<reorder>() && node.get_users().size() == 1)
return;

// Check whether the ranks differ, or whether any pair of static dimensions has different sizes
auto input_pshape = impl_params->get_input_layout(0).get_partial_shape();
auto output_pshape = impl_params->get_output_layout().get_partial_shape();

if (input_pshape.rank().is_static() && output_pshape.rank().is_static()) {
if (input_pshape.size() != output_pshape.size())
return;

auto input_pdim = input_pshape.begin();
auto output_pdim = output_pshape.begin();
while (input_pdim != input_pshape.end()) {
if (input_pdim->is_static() && output_pdim->is_static()) {
if (input_pdim->get_max_length() != output_pdim->get_max_length())
return;
}

input_pdim++;
output_pdim++;
}
}

node.can_be_optimized(true);
GPU_DEBUG_TRACE_DETAIL << "[mark_runtime_skippable_nodes] : " << node.id() << " can_be_optimized" << std::endl;
}
});
}
}
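
For clarity, the compatibility test above can be restated as a standalone sketch. The helper name `shapes_may_match` is hypothetical; `ov::PartialShape` is what the layouts expose via `get_partial_shape()`. It returns false only when the shapes can never be identical, deferring anything involving dynamic dimensions to the runtime re-check in `do_runtime_skip_broadcast()`:

#include <openvino/core/partial_shape.hpp>

// Hypothetical standalone version of the static compatibility test above.
// Returns true when the broadcast might still be an identity at runtime.
bool shapes_may_match(const ov::PartialShape& in, const ov::PartialShape& out) {
    // A dynamic rank cannot be rejected ahead of time; defer to runtime.
    if (in.rank().is_dynamic() || out.rank().is_dynamic())
        return true;
    // Differing ranks can never yield an identity broadcast.
    if (in.size() != out.size())
        return false;
    for (size_t i = 0; i < in.size(); ++i) {
        // Only pairs of static dimensions are comparable at this point.
        if (in[i].is_static() && out[i].is_static() &&
            in[i].get_length() != out[i].get_length())
            return false;
    }
    return true;
}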
4 changes: 3 additions & 1 deletion src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
@@ -21,6 +21,7 @@
#include "gather_inst.h"
#include "permute_inst.h"
#include "strided_slice_inst.h"
#include "broadcast_inst.h"

#include <vector>
#include <list>
@@ -88,7 +89,8 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
!((impl_param.is_type<concatenation>() ||
impl_param.is_type<gather>() ||
impl_param.is_type<permute>() ||
impl_param.is_type<strided_slice>()) && impl_param.is_dynamic())) {
impl_param.is_type<strided_slice>() ||
impl_param.is_type<broadcast>()) && impl_param.is_dynamic())) {
return make_unique<ImplType>(kernel_selector::kernel_data{});
}
auto kernel_params = ImplType::get_kernel_params(ImplType::static_canonicalize_shapes(impl_param));
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/src/graph/include/broadcast_inst.h
@@ -39,6 +39,10 @@ class typed_primitive_inst<broadcast> : public typed_primitive_inst_base<broadcast> {
static layout calc_output_layout(broadcast_node const& node, kernel_impl_params const& impl_param);
static std::string to_string(broadcast_node const& node);
typed_primitive_inst(network& network, broadcast_node const& node);
void update_output_memory() override;

private:
void on_execute() override;
};

using broadcast_inst = typed_primitive_inst<broadcast>;
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -235,6 +235,7 @@ class primitive_inst {
void do_runtime_skip_gather();
void do_runtime_skip_permute();
void do_runtime_skip_strided_slice();
void do_runtime_skip_broadcast();
void do_runtime_in_place_concat();
void do_runtime_in_place_kv_cache();
void configure_shape_of_dependencies();
29 changes: 28 additions & 1 deletion src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -29,6 +29,7 @@
#include "kv_cache_inst.h"
#include "condition_inst.h"
#include "gather_inst.h"
#include "broadcast_inst.h"
#include "experimental_detectron_roi_feature_extractor_inst.hpp"
#include "implementation_map.hpp"
#include "graph_optimizer/prepare_buffer_fusing.h"
@@ -538,7 +539,8 @@ event::ptr primitive_inst::realloc_if_needed() {
}

// Clear out memory if it was previously reused, but now the primitive can't be optimized
if (_node->is_type<gather>() || _node->is_type<permute>() || _node->is_type<reshape>() || _node->is_type<reorder>() || _node->is_type<strided_slice>()) {
if (_node->is_type<gather>() || _node->is_type<permute>() || _node->is_type<reshape>() || _node->is_type<reorder>() ||
_node->is_type<strided_slice>() || _node->is_type<broadcast>()) {
if (can_be_optimized()) {
_max_output_layout_count = _deps[0].first->_max_output_layout_count;
GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO("can_be_optimized");
@@ -1156,6 +1158,30 @@ void primitive_inst::do_runtime_skip_strided_slice() {
set_can_be_optimized(true);
}

void primitive_inst::do_runtime_skip_broadcast() {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("do_runtime_skip_broadcast: " + id()));
// Check pattern
if (!get_node().is_type<broadcast>() || !get_node().can_be_optimized())
return;

GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_broadcast] " << id() << " : check optimizability" << std::endl;
auto input_layout = _impl_params->get_input_layout(0);
auto output_layout = _impl_params->get_output_layout();

// Check the runtime shapes (can_be_optimized may need to be reset)
if (input_layout != output_layout) {
set_can_be_optimized(false);
GPU_DEBUG_TRACE_DETAIL << "--- Cannot optimize because input layout(" << input_layout.to_short_string()
<< ") != output layout(" << output_layout.to_short_string() << ")" << std::endl;
return;
}

GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_broadcast] " << id() << " : can_be_optimized" << std::endl;
GPU_DEBUG_TRACE_DETAIL << " - Input layout : " << _impl_params->get_input_layout(0).to_short_string() << std::endl;
GPU_DEBUG_TRACE_DETAIL << " - Output layout : " << _impl_params->get_output_layout().to_short_string() << std::endl;
set_can_be_optimized(true);
}

void primitive_inst::do_runtime_in_place_concat() {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("do_runtime_in_place_concat: " + id()));
GPU_DEBUG_GET_INSTANCE(debug_config);
@@ -1280,6 +1306,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
do_runtime_in_place_kv_cache();
do_runtime_skip_permute();
do_runtime_skip_strided_slice();
do_runtime_skip_broadcast();

if (!is_valid_fusion()) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("unfused_subgraph_exec: " + id()));