From 8b82aae747c984e873dc2912a7a76ad6a8d5dad3 Mon Sep 17 00:00:00 2001 From: River Li Date: Fri, 16 Aug 2024 12:12:19 +0800 Subject: [PATCH] [GPU] optimize ReduceMax pattern (#24073) ### Details - Optimize ReduceMax pattern to avoid scheduling the whole primitive executed in single EU Sometimes ReduceMax OP is used to convert 3D/4D shape tensor to a scalar output, which leads to all computation being executed in a single EU due to only one output. It causes very poor performance for some models. For example: Grounding DINO model `ReduceMax cost 59.24 ms and consumed 49% execution time out of the whole model.` To break this bottleneck, this PR applies more EUs to execute this primitive by doing ReduceMax one dimension by one dimension. We also notice that the ReduceMax OP selects ref-kernel rather than opt-kernel, which may also cause some performance issues. But it seems the ReduceMax OP doesn't need too much computation, ref-kernel should be enough. The key problem should be that only one EU is scheduled to do the whole ReduceMax computation, which is the root cause of poor performance. 
![image](https://github.com/openvinotoolkit/openvino/assets/31196718/561b98cc-98af-44a9-9ec2-36e5c63de797) ![image](https://github.com/openvinotoolkit/openvino/assets/31196718/d35c33f9-0714-4871-b049-354ccacd95ea) Test result shows: ReduceMax will be improved from 59.24ms to 2.25ms, fps from 8.24 to 15.55 (+88% improvement) ![image](https://github.com/openvinotoolkit/openvino/assets/31196718/4b981bc5-251c-4913-a5d1-84d5bbd5aec8) ### Tickets: - *145690* --- .../decompose_reduce_scalar_output.cpp | 111 ++++++++++++++ .../decompose_reduce_scalar_output.hpp | 23 +++ .../src/plugin/transformations_pipeline.cpp | 2 + .../decompose_reduce_scalar_output_test.cpp | 140 ++++++++++++++++++ 4 files changed, 276 insertions(+) create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_scalar_output.cpp create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_scalar_output.hpp create mode 100644 src/plugins/intel_gpu/tests/unit/transformations/decompose_reduce_scalar_output_test.cpp diff --git a/src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_scalar_output.cpp b/src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_scalar_output.cpp new file mode 100644 index 00000000000000..7bca11f9143393 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_scalar_output.cpp @@ -0,0 +1,111 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "decompose_reduce_scalar_output.hpp" + +#include +#include +#include + +#include "openvino/core/node.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/reduce_max.hpp" +#include "openvino/op/reduce_min.hpp" +#include "openvino/op/reduce_prod.hpp" +#include "openvino/op/reduce_sum.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "transformations/utils/utils.hpp" + +#define CREATE_REDUCE(input, reduce_const, keep_dims) \ + if 
(ov::is_type(reduce_orig)) \ + reduce_new = std::make_shared(input, reduce_const, keep_dims); \ + else if (ov::is_type(reduce_orig)) \ + reduce_new = std::make_shared(input, reduce_const, keep_dims); \ + else if (ov::is_type(reduce_orig)) \ + reduce_new = std::make_shared(input, reduce_const, keep_dims); \ + else if (ov::is_type(reduce_orig)) \ + reduce_new = std::make_shared(input, reduce_const, keep_dims); \ + else \ + return false; + +ov::intel_gpu::DecomposeReduceForScalarOutput::DecomposeReduceForScalarOutput() { + auto check_reduce_shape = [=](Output output) -> bool { + const auto reduce = ov::as_type_ptr(output.get_node_shared_ptr()); + const auto input_shape = reduce->input_value(0).get_partial_shape(); + const auto reduce_shape = reduce->input_value(1).get_partial_shape(); + if (reduce_shape.is_dynamic() || reduce_shape.size() != 1) { + return false; + } else if (reduce_shape.to_shape()[0] <= 1 || reduce_shape.to_shape()[0] != input_shape.size()) { + return false; + } + const auto output_shape = reduce->get_output_partial_shape(0); + if (output_shape.is_static() && input_shape.is_static()) { + // Output size decides at most how many EU threads can be used for this node execution, + // less than 4 EU threads to execute a primitive will lead to poor performance. + if (ov::shape_size(output_shape.to_shape()) > 4) { + return false; + } + // Input shape is too small, 1 EU thread should be enough. 
+ const auto input_static_shape = input_shape.to_shape(); + if (ov::shape_size(input_static_shape) < 64) { + return false; + } + } + return true; + }; + + auto reduce_pattern = ov::pass::pattern:: wrap_type( {ov::pass::pattern::any_input(), ov::pass::pattern::wrap_type()}, check_reduce_shape); + + // register callback + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + auto reduce_orig = + as_type_ptr(pattern_map.at(reduce_pattern).get_node_shared_ptr()); + if (!reduce_orig || transformation_callback(reduce_orig)) + return false; + + const auto input_shape = reduce_orig->input_value(0).get_partial_shape(); + const auto output_shape = reduce_orig->get_output_partial_shape(0); + bool dynamic_shape = input_shape.is_dynamic() || output_shape.is_dynamic(); + std::shared_ptr reduce_new = nullptr; + if (!dynamic_shape) { + // Find the longest dimension + const auto input_static_shape = input_shape.to_shape(); + size_t max_dim = std::distance(input_static_shape.begin(), + std::max_element(input_static_shape.begin(), input_static_shape.end())); + if (input_static_shape[max_dim] == ov::shape_size(input_static_shape)) { + return false; + } + CREATE_REDUCE(reduce_orig->input_value(0), + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {max_dim}), + true); + + } else if (input_shape.rank().is_static()) { + // Dynamic shape and output shape is [0], which will lead to 1 EU thread doing all the work. + auto input = reduce_orig->input_value(0); + for (size_t i = input_shape.size() - 1; i > 0; i--) { + // Reduce one dimension at a time to avoid 1 EU thread doing all the work. 
+ if (input_shape[i].is_dynamic() || (input_shape[i].is_static() && input_shape[i].get_length() >= 4)) { + CREATE_REDUCE(input, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {i}), true); + input = reduce_new->get_default_output(); + } + } + } + if (!reduce_new) + return false; + + CREATE_REDUCE(reduce_new->get_default_output(), reduce_orig->input_value(1), reduce_orig->get_keep_dims()); + reduce_new->set_friendly_name(reduce_orig->get_friendly_name()); + copy_runtime_info(reduce_orig, reduce_new); + replace_node(reduce_orig, reduce_new); + return true; + }; + + auto m = std::make_shared(reduce_pattern, "DecomposeReduceForScalarOutput"); + register_matcher(m, callback); +} diff --git a/src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_scalar_output.hpp b/src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_scalar_output.hpp new file mode 100644 index 00000000000000..cb5db2b715c333 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_scalar_output.hpp @@ -0,0 +1,23 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/visibility.hpp" +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + +// In some cases, Reduce OP is used to reduce one 2D/3D/4D/5D tensor to a scalar output, which leads to all computation +// being executed in a single EU thread due to only one output, resulting in very poor performance. This pattern is used to +// detect this case and decompose Reduce by dimension to avoid poor performance. 
+class DecomposeReduceForScalarOutput : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("DecomposeReduceForScalarOutput", "0"); + DecomposeReduceForScalarOutput(); +}; + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 54f5fe5b9360e4..f5cb6783d4b080 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -58,6 +58,7 @@ #include "plugin/transformations/convert_fc_to_compressed.hpp" #include "plugin/transformations/convert_matmul_to_fc.hpp" #include "plugin/transformations/convert_stridedslices_to_variadicsplit.hpp" +#include "plugin/transformations/decompose_reduce_scalar_output.hpp" #include "plugin/transformations/fc_convert_fusion.hpp" #include "plugin/transformations/fc_horizontal_fusion.hpp" #include "plugin/transformations/kv_cache_fusion.hpp" @@ -408,6 +409,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); manager.register_pass(); manager.register_pass(); + manager.register_pass(); precisions_map int_convert_precision_map { {ov::element::i64, ov::element::i32}, diff --git a/src/plugins/intel_gpu/tests/unit/transformations/decompose_reduce_scalar_output_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/decompose_reduce_scalar_output_test.cpp new file mode 100644 index 00000000000000..7f35e39834aacc --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/transformations/decompose_reduce_scalar_output_test.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "common_test_utils/ov_test_utils.hpp" +#include "intel_gpu/primitives/reduce.hpp" + +using namespace testing; +using namespace ov::intel_gpu; +using namespace ov; + +template 
+std::shared_ptr build_model(const ov::PartialShape& input_shape, + const ov::element::Type& input_type, + const std::vector& reduction_axes, + const bool keep_dim) { + const auto in = std::make_shared(input_type, input_shape); + auto reduce = std::make_shared( + in->get_default_output(), + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes), + keep_dim); + return std::make_shared(ov::NodeVector{reduce}, ov::ParameterVector{in}); +} + +#define decompose_reduce_static_shape(reduce_type) \ + const ov::PartialShape in_shape = {1, 256, 1024, 10}; \ + const ov::element::Type in_type = ov::element::Type_t::f16; \ + const std::vector reduction_axes = {0, 1, 2, 3}; \ + disable_rt_info_check(); \ + { \ + model = build_model(in_shape, in_type, reduction_axes, false); \ + manager.register_pass(); \ + } \ + { \ + const auto in = std::make_shared(in_type, in_shape); \ + auto reduce = std::make_shared(in->get_default_output(), \ + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), \ + true); \ + reduce = std::make_shared( \ + reduce->get_default_output(), \ + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes), \ + false); \ + model_ref = std::make_shared(ov::NodeVector{reduce}, ov::ParameterVector{in}); \ + } + +// Static shape reduce to scalar output, decompose reduce. +TEST_F(TransformationTestsF, DecomposeReduceMaxTest_static_shape){decompose_reduce_static_shape(ov::op::v1::ReduceMax)} + +TEST_F(TransformationTestsF, DecomposeReduceMinTest_static_shape){decompose_reduce_static_shape(ov::op::v1::ReduceMin)} + +TEST_F(TransformationTestsF, DecomposeReduceSumTest_static_shape){decompose_reduce_static_shape(ov::op::v1::ReduceSum)} + +TEST_F(TransformationTestsF, + DecomposeReduceProbTest_static_shape){decompose_reduce_static_shape(ov::op::v1::ReduceProd)} + +// Static shape with small input, don't decompose reduce. 
+TEST_F(TransformationTestsF, DecomposeReduceMaxTest_static_shape_small_input_skip) { + const ov::PartialShape in_shape = {1, 2, 8, 2}; + const ov::element::Type in_type = ov::element::Type_t::f16; + const std::vector reduction_axes = {0, 1, 2, 3}; + { + model = build_model(in_shape, in_type, reduction_axes, true); + manager.register_pass(); + } + { model_ref = build_model(in_shape, in_type, reduction_axes, true); } +} + +// Static shape reduce to non scalar output, don't decompose reduce. +TEST_F(TransformationTestsF, DecomposeReduceMaxTest_static_shape_skip) { + const ov::PartialShape in_shape = {256, 1024, 10}; + const ov::element::Type in_type = ov::element::Type_t::f16; + const std::vector reduction_axes = {1}; + { + model = build_model(in_shape, in_type, reduction_axes, true); + manager.register_pass(); + } + { model_ref = build_model(in_shape, in_type, reduction_axes, true); } +} + +// Dynamic shape reduce to scalar output, decompose reduce. +#define decompose_reduce_dynamic_shape(reduce_type) \ + const ov::PartialShape in_shape = {4, -1, -1, 10}; \ + const ov::element::Type in_type = ov::element::Type_t::f16; \ + const std::vector reduction_axes = {0, 1, 2, 3}; \ + disable_rt_info_check(); \ + { \ + model = build_model(in_shape, in_type, reduction_axes, false); \ + manager.register_pass(); \ + } \ + { \ + const auto in = std::make_shared(in_type, in_shape); \ + auto reduce = std::make_shared(in->get_default_output(), \ + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {3}), \ + true); \ + reduce = std::make_shared(reduce->get_default_output(), \ + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), \ + true); \ + reduce = std::make_shared(reduce->get_default_output(), \ + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {1}), \ + true); \ + reduce = std::make_shared( \ + reduce->get_default_output(), \ + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes), \ + false); \ + 
model_ref = std::make_shared(ov::NodeVector{reduce}, ov::ParameterVector{in}); \ + } + +TEST_F(TransformationTestsF, + DecomposeReduceMaxTest_dynamic_shape){decompose_reduce_dynamic_shape(ov::op::v1::ReduceMax)} + +TEST_F(TransformationTestsF, + DecomposeReduceMinTest_dynamic_shape){decompose_reduce_dynamic_shape(ov::op::v1::ReduceMin)} + +TEST_F(TransformationTestsF, + DecomposeReduceSumTest_dynamic_shape){decompose_reduce_dynamic_shape(ov::op::v1::ReduceSum)} + +TEST_F(TransformationTestsF, + DecomposeReduceProbTest_dynamic_shape){decompose_reduce_dynamic_shape(ov::op::v1::ReduceProd)} + +// Dynamic shape reduce to non-scalar output, don't decompose reduce. +TEST_F(TransformationTestsF, DecomposeReduceMaxTest_dynamic_shape_skip) { + const ov::PartialShape in_shape = {4, -1, -1, 10}; + const ov::element::Type in_type = ov::element::Type_t::f16; + const std::vector reduction_axes = {2}; + { + model = build_model(in_shape, in_type, reduction_axes, false); + manager.register_pass(); + } + { model_ref = build_model(in_shape, in_type, reduction_axes, false); } +} \ No newline at end of file