From 8b82aae747c984e873dc2912a7a76ad6a8d5dad3 Mon Sep 17 00:00:00 2001 From: River Li Date: Fri, 16 Aug 2024 12:12:19 +0800 Subject: [PATCH] [GPU] optimize ReduceMax pattern (#24073) ### Details - Optimize ReduceMax pattern to avoid scheduling the whole primitive executed in single EU Sometimes ReduceMax OP is used to convert 3D/4D shape tensor to a scalar output, which leads to all computation being executed in a single EU due to only one output. It causes very poor performance for some models. For example: Grounding DINO model `ReduceMax cost 59.24 ms and consumed 49% execution time out of the whole model.` To break this bottleneck, this PR applies more EUs to execute this primitive by doing ReduceMax one dimension by one dimension. We also notice that the ReduceMax OP selects ref-kernel rather than opt-kernel, which may also cause some performance issues. But it seems the ReduceMax OP doesn't need too much computation, ref-kernel should be enough. The key problem should be that only one EU is scheduled to do the whole ReduceMax computation, which is the root cause of poor performance. 
![image](https://github.com/openvinotoolkit/openvino/assets/31196718/561b98cc-98af-44a9-9ec2-36e5c63de797) ![image](https://github.com/openvinotoolkit/openvino/assets/31196718/d35c33f9-0714-4871-b049-354ccacd95ea) Test result shows: ReduceMax will be improved from 59.24ms to 2.25ms, fps from 8.24 to 15.55 (+88% improvement) ![image](https://github.com/openvinotoolkit/openvino/assets/31196718/4b981bc5-251c-4913-a5d1-84d5bbd5aec8) ### Tickets: - *145690* --- .../decompose_reduce_scalar_output.cpp | 111 ++++++++++++++ .../decompose_reduce_scalar_output.hpp | 23 +++ .../src/plugin/transformations_pipeline.cpp | 2 + .../decompose_reduce_scalar_output_test.cpp | 140 ++++++++++++++++++ 4 files changed, 276 insertions(+) create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_scalar_output.cpp create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_scalar_output.hpp create mode 100644 src/plugins/intel_gpu/tests/unit/transformations/decompose_reduce_scalar_output_test.cpp diff --git a/src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_scalar_output.cpp b/src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_scalar_output.cpp new file mode 100644 index 00000000000000..7bca11f9143393 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_scalar_output.cpp @@ -0,0 +1,111 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "decompose_reduce_scalar_output.hpp" + +#include +#include +#include + +#include "openvino/core/node.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/reduce_max.hpp" +#include "openvino/op/reduce_min.hpp" +#include "openvino/op/reduce_prod.hpp" +#include "openvino/op/reduce_sum.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "transformations/utils/utils.hpp" + +#define CREATE_REDUCE(input, reduce_const, keep_dims) \ + if 
(ov::is_type(reduce_orig)) \ + reduce_new = std::make_shared(input, reduce_const, keep_dims); \ + else if (ov::is_type(reduce_orig)) \ + reduce_new = std::make_shared(input, reduce_const, keep_dims); \ + else if (ov::is_type(reduce_orig)) \ + reduce_new = std::make_shared(input, reduce_const, keep_dims); \ + else if (ov::is_type(reduce_orig)) \ + reduce_new = std::make_shared(input, reduce_const, keep_dims); \ + else \ + return false; + +ov::intel_gpu::DecomposeReduceForScalarOutput::DecomposeReduceForScalarOutput() { + auto check_reduce_shape = [=](Output output) -> bool { + const auto reduce = ov::as_type_ptr(output.get_node_shared_ptr()); + const auto input_shape = reduce->input_value(0).get_partial_shape(); + const auto reduce_shape = reduce->input_value(1).get_partial_shape(); + if (reduce_shape.is_dynamic() || reduce_shape.size() != 1) { + return false; + } else if (reduce_shape.to_shape()[0] <= 1 || reduce_shape.to_shape()[0] != input_shape.size()) { + return false; + } + const auto output_shape = reduce->get_output_partial_shape(0); + if (output_shape.is_static() && input_shape.is_static()) { + // Output size decides at most how many EU threads can be used for this node execution, + // less than 4 EU threads to execute a primitive will lead to poor performance. + if (ov::shape_size(output_shape.to_shape()) > 4) { + return false; + } + // Input shape is too small, 1 EU thread should be enough. 
+ const auto input_static_shape = input_shape.to_shape(); + if (ov::shape_size(input_static_shape) < 64) { + return false; + } + } + return true; + }; + + auto reduce_pattern = ov::pass::pattern:: wrap_type( {ov::pass::pattern::any_input(), ov::pass::pattern::wrap_type()}, check_reduce_shape); + + // register callback + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + auto reduce_orig = + as_type_ptr(pattern_map.at(reduce_pattern).get_node_shared_ptr()); + if (!reduce_orig || transformation_callback(reduce_orig)) + return false; + + const auto input_shape = reduce_orig->input_value(0).get_partial_shape(); + const auto output_shape = reduce_orig->get_output_partial_shape(0); + bool dynamic_shape = input_shape.is_dynamic() || output_shape.is_dynamic(); + std::shared_ptr reduce_new = nullptr; + if (!dynamic_shape) { + // Find the longest dimension + const auto input_static_shape = input_shape.to_shape(); + size_t max_dim = std::distance(input_static_shape.begin(), + std::max_element(input_static_shape.begin(), input_static_shape.end())); + if (input_static_shape[max_dim] == ov::shape_size(input_static_shape)) { + return false; + } + CREATE_REDUCE(reduce_orig->input_value(0), + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {max_dim}), + true); + + } else if (input_shape.rank().is_static()) { + // Dynamic shape and output shape is [0], which will lead to 1 EU thread doing all the work. + auto input = reduce_orig->input_value(0); + for (size_t i = input_shape.size() - 1; i > 0; i--) { + // Reduce one dimension at a time to avoid 1 EU thread doing all the work. 
+ if (input_shape[i].is_dynamic() || (input_shape[i].is_static() && input_shape[i].get_length() >= 4)) { + CREATE_REDUCE(input, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {i}), true); + input = reduce_new->get_default_output(); + } + } + } + if (!reduce_new) + return false; + + CREATE_REDUCE(reduce_new->get_default_output(), reduce_orig->input_value(1), reduce_orig->get_keep_dims()); + reduce_new->set_friendly_name(reduce_orig->get_friendly_name()); + copy_runtime_info(reduce_orig, reduce_new); + replace_node(reduce_orig, reduce_new); + return true; + }; + + auto m = std::make_shared(reduce_pattern, "DecomposeReduceForScalarOutput"); + register_matcher(m, callback); +} diff --git a/src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_scalar_output.hpp b/src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_scalar_output.hpp new file mode 100644 index 00000000000000..cb5db2b715c333 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_scalar_output.hpp @@ -0,0 +1,23 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/visibility.hpp" +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + +// In some cases, Reduce OP is used to reduce one 2D/3D/4D/5D tensor to a scalar output, which leads to all computation +// being executed in a single EU thread due to only one output, resulting in very poor performance. This pattern is used to +// detect this case and decompose Reduce by dimension to avoid poor performance. 
+class DecomposeReduceForScalarOutput : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("DecomposeReduceForScalarOutput", "0"); + DecomposeReduceForScalarOutput(); +}; + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 54f5fe5b9360e4..f5cb6783d4b080 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -58,6 +58,7 @@ #include "plugin/transformations/convert_fc_to_compressed.hpp" #include "plugin/transformations/convert_matmul_to_fc.hpp" #include "plugin/transformations/convert_stridedslices_to_variadicsplit.hpp" +#include "plugin/transformations/decompose_reduce_scalar_output.hpp" #include "plugin/transformations/fc_convert_fusion.hpp" #include "plugin/transformations/fc_horizontal_fusion.hpp" #include "plugin/transformations/kv_cache_fusion.hpp" @@ -408,6 +409,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); manager.register_pass(); manager.register_pass(); + manager.register_pass(); precisions_map int_convert_precision_map { {ov::element::i64, ov::element::i32}, diff --git a/src/plugins/intel_gpu/tests/unit/transformations/decompose_reduce_scalar_output_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/decompose_reduce_scalar_output_test.cpp new file mode 100644 index 00000000000000..7f35e39834aacc --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/transformations/decompose_reduce_scalar_output_test.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "common_test_utils/ov_test_utils.hpp" +#include "intel_gpu/primitives/reduce.hpp" + +using namespace testing; +using namespace ov::intel_gpu; +using namespace ov; + +template 
+std::shared_ptr build_model(const ov::PartialShape& input_shape, + const ov::element::Type& input_type, + const std::vector& reduction_axes, + const bool keep_dim) { + const auto in = std::make_shared(input_type, input_shape); + auto reduce = std::make_shared( + in->get_default_output(), + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes), + keep_dim); + return std::make_shared(ov::NodeVector{reduce}, ov::ParameterVector{in}); +} + +#define decompose_reduce_static_shape(reduce_type) \ + const ov::PartialShape in_shape = {1, 256, 1024, 10}; \ + const ov::element::Type in_type = ov::element::Type_t::f16; \ + const std::vector reduction_axes = {0, 1, 2, 3}; \ + disable_rt_info_check(); \ + { \ + model = build_model(in_shape, in_type, reduction_axes, false); \ + manager.register_pass(); \ + } \ + { \ + const auto in = std::make_shared(in_type, in_shape); \ + auto reduce = std::make_shared(in->get_default_output(), \ + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), \ + true); \ + reduce = std::make_shared( \ + reduce->get_default_output(), \ + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes), \ + false); \ + model_ref = std::make_shared(ov::NodeVector{reduce}, ov::ParameterVector{in}); \ + } + +// Static shape reduce to scalar output, decompose reduce. +TEST_F(TransformationTestsF, DecomposeReduceMaxTest_static_shape){decompose_reduce_static_shape(ov::op::v1::ReduceMax)} + +TEST_F(TransformationTestsF, DecomposeReduceMinTest_static_shape){decompose_reduce_static_shape(ov::op::v1::ReduceMin)} + +TEST_F(TransformationTestsF, DecomposeReduceSumTest_static_shape){decompose_reduce_static_shape(ov::op::v1::ReduceSum)} + +TEST_F(TransformationTestsF, + DecomposeReduceProbTest_static_shape){decompose_reduce_static_shape(ov::op::v1::ReduceProd)} + +// Static shape with small input, don't decompose reduce. 
+TEST_F(TransformationTestsF, DecomposeReduceMaxTest_static_shape_small_input_skip) { + const ov::PartialShape in_shape = {1, 2, 8, 2}; + const ov::element::Type in_type = ov::element::Type_t::f16; + const std::vector reduction_axes = {0, 1, 2, 3}; + { + model = build_model(in_shape, in_type, reduction_axes, true); + manager.register_pass(); + } + { model_ref = build_model(in_shape, in_type, reduction_axes, true); } +} + +// Static shape reduce to non scalar output, don't decompose reduce. +TEST_F(TransformationTestsF, DecomposeReduceMaxTest_static_shape_skip) { + const ov::PartialShape in_shape = {256, 1024, 10}; + const ov::element::Type in_type = ov::element::Type_t::f16; + const std::vector reduction_axes = {1}; + { + model = build_model(in_shape, in_type, reduction_axes, true); + manager.register_pass(); + } + { model_ref = build_model(in_shape, in_type, reduction_axes, true); } +} + +// Dynamic shape reduce to scalar output, decompose reduce. +#define decompose_reduce_dynamic_shape(reduce_type) \ + const ov::PartialShape in_shape = {4, -1, -1, 10}; \ + const ov::element::Type in_type = ov::element::Type_t::f16; \ + const std::vector reduction_axes = {0, 1, 2, 3}; \ + disable_rt_info_check(); \ + { \ + model = build_model(in_shape, in_type, reduction_axes, false); \ + manager.register_pass(); \ + } \ + { \ + const auto in = std::make_shared(in_type, in_shape); \ + auto reduce = std::make_shared(in->get_default_output(), \ + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {3}), \ + true); \ + reduce = std::make_shared(reduce->get_default_output(), \ + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), \ + true); \ + reduce = std::make_shared(reduce->get_default_output(), \ + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {1}), \ + true); \ + reduce = std::make_shared( \ + reduce->get_default_output(), \ + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes), \ + false); \ + 
model_ref = std::make_shared(ov::NodeVector{reduce}, ov::ParameterVector{in}); \ + } + +TEST_F(TransformationTestsF, + DecomposeReduceMaxTest_dynamic_shape){decompose_reduce_dynamic_shape(ov::op::v1::ReduceMax)} + +TEST_F(TransformationTestsF, + DecomposeReduceMinTest_dynamic_shape){decompose_reduce_dynamic_shape(ov::op::v1::ReduceMin)} + +TEST_F(TransformationTestsF, + DecomposeReduceSumTest_dynamic_shape){decompose_reduce_dynamic_shape(ov::op::v1::ReduceSum)} + +TEST_F(TransformationTestsF, + DecomposeReduceProbTest_dynamic_shape){decompose_reduce_dynamic_shape(ov::op::v1::ReduceProd)} + +// Dynamic shape reduce to non-scalar output, don't decompose reduce. +TEST_F(TransformationTestsF, DecomposeReduceMaxTest_dynamic_shape_skip) { + const ov::PartialShape in_shape = {4, -1, -1, 10}; + const ov::element::Type in_type = ov::element::Type_t::f16; + const std::vector reduction_axes = {2}; + { + model = build_model(in_shape, in_type, reduction_axes, false); + manager.register_pass(); + } + { model_ref = build_model(in_shape, in_type, reduction_axes, false); } +} \ No newline at end of file