[GPU] Skip broadcast when input and output shapes are identical #23331

Merged 3 commits on Mar 13, 2024
19 changes: 19 additions & 0 deletions src/plugins/intel_gpu/src/graph/broadcast.cpp
@@ -126,6 +126,25 @@ std::string broadcast_inst::to_string(broadcast_node const& node) {
return primitive_description.str();
}

void broadcast_inst::on_execute() {
update_output_memory();
}

void broadcast_inst::update_output_memory() {
if (!can_be_optimized())
return;
if (static_cast<bool>(_outputs[0]) && _network.get_engine().is_the_same_buffer(output_memory(), input_memory()))
return;

if (_node != nullptr)
build_deps();

GPU_DEBUG_TRACE_DETAIL << id() << " : update_output_memory with mem of input " << get_node().get_dependency(0).id()
<< " : " << input_memory_ptr()->buffer_ptr() << std::endl;
_outputs[0] = input_memory_ptr();
_mem_allocated = false;
}

broadcast_inst::typed_primitive_inst(network& network, broadcast_node const& node) : parent(network, node) {
auto input_layout = node.get_input_layout();
if (input_layout.is_dynamic())
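For readers unfamiliar with the mechanism above: the "skip" never copies data; update_output_memory() simply points the instance's output at the input's buffer. A toy sketch of that aliasing idea in plain C++ (the names below are illustrative only, not the plugin's API):

#include <cassert>
#include <memory>
#include <vector>

// Toy stand-ins: shared_ptr<vector<float>> plays the role of cldnn's
// memory handles; none of these names come from the actual plugin API.
struct toy_node {
    std::shared_ptr<std::vector<float>> input;
    std::shared_ptr<std::vector<float>> output;
    bool can_be_optimized = false;

    // Mirrors the shape of broadcast_inst::update_output_memory(): bail out
    // unless optimizable, bail out if already aliased, otherwise alias.
    void on_execute() {
        if (!can_be_optimized)
            return;
        if (output == input)
            return;            // output already points at the input buffer
        output = input;        // zero-copy "skip": no kernel, no allocation
    }
};

int main() {
    toy_node n;
    n.input = std::make_shared<std::vector<float>>(16, 1.0f);
    n.can_be_optimized = true;
    n.on_execute();
    assert(n.output.get() == n.input.get());  // same underlying buffer
    return 0;
}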
src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp
@@ -8,6 +8,7 @@
#include "strided_slice_inst.h"
#include "kv_cache_inst.h"
#include "gemm_inst.h"
#include "broadcast_inst.h"
#include "program_helpers.h"

using namespace cldnn;
@@ -95,5 +96,44 @@ void mark_runtime_skippable_nodes::run(program& p) {
node.can_be_optimized(true);
GPU_DEBUG_TRACE_DETAIL << "[mark_runtime_skippable_nodes] : " << node.id() << " can_be_optimized" << std::endl;
});
program_helpers::do_for_types<broadcast>(*node, [](broadcast_node& node){
auto impl_params = node.get_kernel_impl_params();
if (node.is_output()
|| node.has_fused_primitives()
|| (impl_params->get_input_layout(0).format != impl_params->get_output_layout().format)
|| (impl_params->get_input_layout(0).data_type != impl_params->get_output_layout().data_type))
return;

if (node.is_dynamic()) {
// If the only user is a reorder, it may be fused into this broadcast by the remove_redundant_reorders pass.
// In that case the broadcast cannot be optimized, because its input and output shapes would differ.
if (node.have_user_with_type<reorder>() && node.get_users().size() == 1)
return;

// Skip optimization if the ranks differ, or if any pair of static dimensions differs in size
auto input_pshape = impl_params->get_input_layout(0).get_partial_shape();
auto output_pshape = impl_params->get_output_layout().get_partial_shape();

if (input_pshape.rank().is_static() && output_pshape.rank().is_static()) {
if (input_pshape.size() != output_pshape.size())
return;

auto input_pdim = input_pshape.begin();
auto output_pdim = output_pshape.begin();
while (input_pdim != input_pshape.end()) {
if (input_pdim->is_static() && output_pdim->is_static()) {
if (input_pdim->get_max_length() != output_pdim->get_max_length())
return;
}

input_pdim++;
output_pdim++;
}
}

node.can_be_optimized(true);
Review thread on the line above:

Contributor:
Is there any way to further reduce the target scope? I am afraid this will mark most broadcasts as can_be_optimized, leaving less chance for memory reuse, even though the ratio of actually optimizable cases is considered small. For example, could we exclude more of the apparently non-optimizable cases, like [-1, 1, -1, -1] -> [-1, 16, -1, -1]?

Contributor Author:
Oh... I'll try to reduce the target scope as you reviewed. Thank you.

Contributor Author:
I added an additional condition not to skip broadcast. Thank you.


GPU_DEBUG_TRACE_DETAIL << "[mark_runtime_skippable_nodes] : " << node.id() << " can_be_optimized" << std::endl;
}
});
}
}
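A standalone sketch of the static-dimension guard discussed in the review thread above, assuming OpenVINO's public ov::PartialShape / ov::Dimension API (may_skip is a hypothetical helper, not the PR's code). It shows how a statically mismatched pair such as [-1, 1, -1, -1] vs. [-1, 16, -1, -1] is excluded at graph time:

#include <openvino/core/partial_shape.hpp>
#include <iostream>

// Returns true only when the broadcast *may* be a no-op: equal static ranks
// and no pair of static dimensions that already disagree. Dynamic dimensions
// are deferred to the runtime check (do_runtime_skip_broadcast below).
static bool may_skip(const ov::PartialShape& in, const ov::PartialShape& out) {
    if (in.rank().is_dynamic() || out.rank().is_dynamic() || in.size() != out.size())
        return false;
    for (size_t i = 0; i < in.size(); ++i) {
        if (in[i].is_static() && out[i].is_static() &&
            in[i].get_max_length() != out[i].get_max_length())
            return false;  // e.g. 1 vs 16: definitely a real broadcast
    }
    return true;
}

int main() {
    const auto D = ov::Dimension::dynamic();
    std::cout << may_skip({D, 1, D, D}, {D, 16, D, D}) << "\n";   // 0: excluded
    std::cout << may_skip({D, 16, D, D}, {D, 16, D, D}) << "\n";  // 1: candidate
    return 0;
}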
src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
@@ -21,6 +21,7 @@
#include "gather_inst.h"
#include "permute_inst.h"
#include "strided_slice_inst.h"
#include "broadcast_inst.h"

#include <vector>
#include <list>
@@ -88,7 +89,8 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
!((impl_param.is_type<concatenation>() ||
impl_param.is_type<gather>() ||
impl_param.is_type<permute>() ||
impl_param.is_type<strided_slice>()) && impl_param.is_dynamic())) {
impl_param.is_type<strided_slice>() ||
impl_param.is_type<broadcast>()) && impl_param.is_dynamic())) {
return make_unique<ImplType>(kernel_selector::kernel_data{});
}
auto kernel_params = ImplType::get_kernel_params(ImplType::static_canonicalize_shapes(impl_param));
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/src/graph/include/broadcast_inst.h
@@ -39,6 +39,10 @@ class typed_primitive_inst<broadcast> : public typed_primitive_inst_base<broadcast> {
static layout calc_output_layout(broadcast_node const& node, kernel_impl_params const& impl_param);
static std::string to_string(broadcast_node const& node);
typed_primitive_inst(network& network, broadcast_node const& node);
void update_output_memory() override;

private:
void on_execute() override;
};

using broadcast_inst = typed_primitive_inst<broadcast>;
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -235,6 +235,7 @@ class primitive_inst {
void do_runtime_skip_gather();
void do_runtime_skip_permute();
void do_runtime_skip_strided_slice();
void do_runtime_skip_broadcast();
void do_runtime_in_place_concat();
void do_runtime_in_place_kv_cache();
void configure_shape_of_dependencies();
29 changes: 28 additions & 1 deletion src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -29,6 +29,7 @@
#include "kv_cache_inst.h"
#include "condition_inst.h"
#include "gather_inst.h"
#include "broadcast_inst.h"
#include "experimental_detectron_roi_feature_extractor_inst.hpp"
#include "implementation_map.hpp"
#include "graph_optimizer/prepare_buffer_fusing.h"
@@ -538,7 +539,8 @@ event::ptr primitive_inst::realloc_if_needed() {
}

// Clear out memory if it was previously reused, but now the primitive can't be optimized
if (_node->is_type<gather>() || _node->is_type<permute>() || _node->is_type<reshape>() || _node->is_type<reorder>() || _node->is_type<strided_slice>()) {
if (_node->is_type<gather>() || _node->is_type<permute>() || _node->is_type<reshape>() || _node->is_type<reorder>() ||
_node->is_type<strided_slice>() || _node->is_type<broadcast>()) {
if (can_be_optimized()) {
_max_output_layout_count = _deps[0].first->_max_output_layout_count;
GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO("can_be_optimized");
@@ -1156,6 +1158,30 @@ void primitive_inst::do_runtime_skip_strided_slice() {
set_can_be_optimized(true);
}

void primitive_inst::do_runtime_skip_broadcast() {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("do_runtime_skip_broadcast: " + id()));
// Check pattern
if (!get_node().is_type<broadcast>() || !get_node().can_be_optimized())
return;

GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_broadcast] " << id() << " : check optimizability" << std::endl;
auto input_layout = _impl_params->get_input_layout(0);
auto output_layout = _impl_params->get_output_layout();

// Check runtime shape (need to reset can_be_optimized)
if (input_layout != output_layout) {
set_can_be_optimized(false);
GPU_DEBUG_TRACE_DETAIL << "--- Cannot optimize because input layout(" << input_layout.to_short_string()
<< ") != output layout(" << output_layout.to_short_string() << ")" << std::endl;
return;
}

GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_broadcast] " << id() << " : can_be_optimized" << std::endl;
GPU_DEBUG_TRACE_DETAIL << " - Input layout : " << _impl_params->get_input_layout(0).to_short_string() << std::endl;
GPU_DEBUG_TRACE_DETAIL << " - Output layout : " << _impl_params->get_output_layout().to_short_string() << std::endl;
set_can_be_optimized(true);
}

void primitive_inst::do_runtime_in_place_concat() {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("do_runtime_in_place_concat: " + id()));
GPU_DEBUG_GET_INSTANCE(debug_config);
Expand Down Expand Up @@ -1280,6 +1306,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
do_runtime_in_place_kv_cache();
do_runtime_skip_permute();
do_runtime_skip_strided_slice();
do_runtime_skip_broadcast();

if (!is_valid_fusion()) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("unfused_subgraph_exec: " + id()));
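Taken together, mark_runtime_skippable_nodes and do_runtime_skip_broadcast form a two-phase contract: the graph pass only marks a broadcast as potentially skippable, and each inference re-checks the concrete shapes before the copy is actually skipped. A minimal sketch of that contract (hypothetical names, fixed rank-4 shapes for brevity):

#include <array>
#include <cstdint>
#include <iostream>

// Phase 2 in isolation: a broadcast marked skippable at graph time is only
// actually skipped when the concrete per-inference shapes are identical.
static bool runtime_can_skip(bool marked_skippable,
                             const std::array<int64_t, 4>& in_shape,
                             const std::array<int64_t, 4>& out_shape) {
    return marked_skippable && in_shape == out_shape;
}

int main() {
    std::cout << runtime_can_skip(true, {1, 16, 8, 8}, {1, 16, 8, 8}) << "\n";  // 1: skip
    std::cout << runtime_can_skip(true, {1, 1, 8, 8}, {1, 16, 8, 8}) << "\n";   // 0: run kernel
    return 0;
}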