Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GPU] optimize ReduceMax pattern #24073

Merged
Merged
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "decompose_reduce_scalar_output.hpp"

#include <algorithm>
#include <memory>
#include <vector>

#include "openvino/core/node.hpp"
#include "openvino/core/rt_info.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/reduce_max.hpp"
#include "openvino/op/reduce_mean.hpp"
#include "openvino/op/reduce_min.hpp"
#include "openvino/op/reduce_prod.hpp"
#include "openvino/op/reduce_sum.hpp"
#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "transformations/utils/utils.hpp"

// Creates a new reduce op of the same concrete type as `reduce_orig` (a variable
// expected to exist at every expansion site) and assigns it to `reduce_new`.
// NOTE: ReduceMean is matched by the pass pattern below but is deliberately not
// handled here — the trailing `else` bails out of the matcher callback for it
// (presumably because decomposing a mean needs rescaling — TODO confirm).
// No comments may be placed inside the macro body: they would swallow the `\`
// line continuations.
#define CREATE_REDUCE(input, reduce_const, keep_dims) \
if (ov::is_type<ov::op::v1::ReduceSum>(reduce_orig)) \
reduce_new = std::make_shared<ov::op::v1::ReduceSum>(input, reduce_const, keep_dims); \
else if (ov::is_type<ov::op::v1::ReduceMin>(reduce_orig)) \
reduce_new = std::make_shared<ov::op::v1::ReduceMin>(input, reduce_const, keep_dims); \
else if (ov::is_type<ov::op::v1::ReduceMax>(reduce_orig)) \
reduce_new = std::make_shared<ov::op::v1::ReduceMax>(input, reduce_const, keep_dims); \
else if (ov::is_type<ov::op::v1::ReduceProd>(reduce_orig)) \
reduce_new = std::make_shared<ov::op::v1::ReduceProd>(input, reduce_const, keep_dims); \
else \
return false;

// Matcher pass that rewrites a ReduceSum/Min/Max/Prod collapsing ALL input
// dimensions into a (near-)scalar output. With only a handful of output
// elements, the whole reduction lands on a single EU thread on GPU; decomposing
// it into a chain of single-axis reduces restores parallelism.
// NOTE: this block originally had GitHub review-comment text pasted into the
// middle of the callback (between the shape check and its `return false;`),
// which made the file non-compiling — that text is removed here; the logic is
// unchanged.
ov::intel_gpu::DecomposeReduceForScalarOutput::DecomposeReduceForScalarOutput() {
    // Predicate for the axes input: accept only a static 1D constant listing
    // more than one axis (a multi-axis reduction).
    auto check_reduce_shape = [=](Output<Node> output) -> bool {
        auto reduce_shape = output.get_partial_shape();
        if (reduce_shape.is_dynamic() || reduce_shape.size() != 1) {
            return false;
        } else if (reduce_shape.to_shape()[0] <= 1) {
            return false;
        }
        return true;
    };
    auto reduce_pattern = ov::pass::pattern::wrap_type<ov::op::v1::ReduceSum,
                                                       ov::op::v1::ReduceMean,
                                                       ov::op::v1::ReduceProd,
                                                       ov::op::v1::ReduceMin,
                                                       ov::op::v1::ReduceMax>(
        {ov::pass::pattern::any_input(), ov::pass::pattern::wrap_type<ov::op::v0::Constant>(check_reduce_shape)});

    // register callback
    ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) {
        const auto& pattern_map = m.get_pattern_value_map();
        auto reduce_orig =
            as_type_ptr<op::util::ArithmeticReductionKeepDims>(pattern_map.at(reduce_pattern).get_node_shared_ptr());
        if (!reduce_orig || transformation_callback(reduce_orig))
            return false;

        const auto input_shape = reduce_orig->input_value(0).get_partial_shape();
        const auto reduce_shape = reduce_orig->input_value(1).get_partial_shape();
        // Only handle a full reduction, i.e. the axes constant covers every input dim.
        if (reduce_shape.to_shape()[0] != input_shape.size())
            return false;

        auto dynamic_shape = false;
        const auto output_shape = reduce_orig->get_output_partial_shape(0);
        if (input_shape.is_dynamic() || output_shape.is_dynamic()) {
            dynamic_shape = true;
        }

        std::shared_ptr<ov::op::util::ArithmeticReductionKeepDims> reduce_new = nullptr;
        if (!dynamic_shape) {
            // Output size decides at most how many EU threads can be used for this node execution,
            // less than 4 EU threads to execute a primitive will lead to poor performance.
            if (ov::shape_size(output_shape.to_shape()) > 4) {
                return false;
            }
            // Input shape is too small, 1 EU thread should be enough.
            const auto input_static_shape = input_shape.to_shape();
            if (ov::shape_size(input_static_shape) < 64) {
                return false;
            }

            // Find the longest dimension: reducing it first leaves the product of
            // the remaining dims as parallelizable output elements.
            size_t max_dim = std::distance(input_static_shape.begin(),
                                           std::max_element(input_static_shape.begin(), input_static_shape.end()));
            // If every element lives in a single dimension, splitting the reduce
            // would not expose any extra parallelism — leave it alone.
            if (input_static_shape[max_dim] == ov::shape_size(input_static_shape)) {
                return false;
            }

            CREATE_REDUCE(reduce_orig->input_value(0),
                          ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {max_dim}),
                          true);

        } else if (input_shape.rank().is_static()) {
            // Dynamic shape and output shape is [0], which will lead to 1 EU thread to do all work.
            auto input = reduce_orig->input_value(0);
            // Dim 0 is intentionally skipped here; the final full-axes reduce
            // below still collapses it.
            for (size_t i = input_shape.size() - 1; i > 0; i--) {
                // Reduce one dimension by one dimension to avoid 1 EU thread do all work.
                if (input_shape[i].is_dynamic() || (input_shape[i].is_static() && input_shape[i].get_length() >= 4)) {
                    CREATE_REDUCE(input, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {i}), true);
                    input = reduce_new->get_default_output();
                }
            }
        }
        if (!reduce_new)
            return false;

        // Final reduce over the original axes restores the original output shape
        // and keep_dims semantics; the new node takes over name/rt_info/users.
        CREATE_REDUCE(reduce_new->get_default_output(), reduce_orig->input_value(1), reduce_orig->get_keep_dims());
        reduce_new->set_friendly_name(reduce_orig->get_friendly_name());
        copy_runtime_info(reduce_orig, reduce_new);
        replace_node(reduce_orig, reduce_new);
        return true;
    };

    auto m = std::make_shared<ov::pass::pattern::Matcher>(reduce_pattern, "DecomposeReduceForScalarOutput");
    register_matcher(m, callback);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/core/visibility.hpp"
#include "openvino/pass/graph_rewrite.hpp"

namespace ov {
namespace intel_gpu {

// In some cases a Reduce op collapses one 2D/3D/4D/5D tensor into a scalar output, which forces all
// of the computation onto a single EU thread (there is only one output element), resulting in very
// poor performance. This pass detects that case and decomposes the Reduce dimension by dimension to
// avoid it.
class DecomposeReduceForScalarOutput : public ov::pass::MatcherPass {
public:
    OPENVINO_RTTI("DecomposeReduceForScalarOutput", "0");
    DecomposeReduceForScalarOutput();
};

} // namespace intel_gpu
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
#include "plugin/transformations/convert_fc_to_compressed.hpp"
#include "plugin/transformations/convert_matmul_to_fc.hpp"
#include "plugin/transformations/convert_stridedslices_to_variadicsplit.hpp"
#include "plugin/transformations/decompose_reduce_scalar_output.hpp"
#include "plugin/transformations/fc_convert_fusion.hpp"
#include "plugin/transformations/fc_horizontal_fusion.hpp"
#include "plugin/transformations/kv_cache_fusion.hpp"
Expand Down Expand Up @@ -408,6 +409,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
manager.register_pass<ov::pass::ConvertMulticlassNmsToMulticlassNmsIE>();
manager.register_pass<ov::pass::TransposeMatMul>();
manager.register_pass<ov::pass::ConvertPad12ToPad1, false>();
manager.register_pass<DecomposeReduceForScalarOutput>();

precisions_map int_convert_precision_map {
{ov::element::i64, ov::element::i32},
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <gtest/gtest.h>

#include <memory>
#include <openvino/core/model.hpp>
#include <openvino/opsets/opset9.hpp>
#include <openvino/pass/manager.hpp>
#include <plugin/transformations/decompose_reduce_scalar_output.hpp>
#include <string>

#include "common_test_utils/ov_test_utils.hpp"
#include "intel_gpu/primitives/reduce.hpp"

using namespace testing;
using namespace ov::intel_gpu;
using namespace cldnn;
using ReduceType = cldnn::reduce_mode;

// Builds a Reduce op of the requested mode into the local variable `reduce`
// (each expansion site must declare `std::shared_ptr<...ArithmeticReductionKeepDims> reduce`).
// Unsupported modes (e.g. mean) leave `reduce` null and trip the assert.
// NOTE: this macro originally had GitHub review-comment text pasted between its
// first two branches, which made the file non-compiling — removed here, logic
// unchanged. (Reviewer also suggested replacing the macro with a template;
// that would change every call site, so it is left as a macro.)
#define create_reduce(arg, reduction, keep_dims, reduce_type)                             \
    if (reduce_type == reduce_mode::sum)                                                  \
        reduce = std::make_shared<ov::op::v1::ReduceSum>(arg, reduction, keep_dims);      \
    else if (reduce_type == reduce_mode::min)                                             \
        reduce = std::make_shared<ov::op::v1::ReduceMin>(arg, reduction, keep_dims);      \
    else if (reduce_type == reduce_mode::max)                                             \
        reduce = std::make_shared<ov::op::v1::ReduceMax>(arg, reduction, keep_dims);      \
    else if (reduce_type == reduce_mode::prod)                                            \
        reduce = std::make_shared<ov::op::v1::ReduceProd>(arg, reduction, keep_dims);     \
    OPENVINO_ASSERT(reduce != nullptr, "cannot create reduce: ", static_cast<int>(reduce_type));

// Builds a single-Reduce model: Parameter(input_type, input_shape) -> Reduce of
// `reduce_type` over `reduction_axes` (keep_dims = keep_dim).
// The local variable MUST be named `reduce` — the create_reduce macro assigns
// into that exact identifier.
static std::shared_ptr<ov::Model> build_model(const ov::PartialShape& input_shape,
                                              const ov::element::Type& input_type,
                                              const std::vector<size_t>& reduction_axes,
                                              const bool keep_dim,
                                              const ReduceType reduce_type) {
    const auto in = std::make_shared<ov::op::v0::Parameter>(input_type, input_shape);
    std::shared_ptr<ov::op::util::ArithmeticReductionKeepDims> reduce = nullptr;
    create_reduce(in->get_default_output(),
                  ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes),
                  keep_dim,
                  reduce_type);

    return std::make_shared<ov::Model>(ov::NodeVector{reduce}, ov::ParameterVector{in});
}

// Static full reduction {1,256,1024,10} -> scalar. Expected graph after the pass:
// first reduce the longest dimension (axis 2, size 1024) with keep_dims=true,
// then apply the original full-axes reduce.
// BUGFIX: the reference model's final reduce was hard-coded to reduce_mode::max,
// so the min/sum/prod variants compared against the wrong op type — it now uses
// the tested reduce_type, matching what the pass actually produces.
#define decompose_reduce_static_shape(reduce_type)                                                         \
    const ov::PartialShape in_shape = {1, 256, 1024, 10};                                                  \
    const ov::element::Type in_type = ov::element::Type_t::f16;                                            \
    const std::vector<size_t> reduction_axes = {0, 1, 2, 3};                                               \
    disable_rt_info_check();                                                                               \
    {                                                                                                      \
        model = build_model(in_shape, in_type, reduction_axes, false, reduce_type);                        \
        manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();                            \
    }                                                                                                      \
    {                                                                                                      \
        const auto in = std::make_shared<ov::op::v0::Parameter>(in_type, in_shape);                        \
        std::shared_ptr<ov::op::util::ArithmeticReductionKeepDims> reduce = nullptr;                       \
        create_reduce(in->get_default_output(),                                                            \
                      ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}),                   \
                      true,                                                                                \
                      reduce_type);                                                                        \
        create_reduce(                                                                                     \
            reduce->get_default_output(),                                                                  \
            ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes), \
            false,                                                                                         \
            reduce_type);                                                                                  \
        model_ref = std::make_shared<ov::Model>(ov::NodeVector{reduce}, ov::ParameterVector{in});          \
    }

// Static-shape reduce to a scalar output: the pass must decompose the reduce.
// Each test body is a single expansion of decompose_reduce_static_shape.
// (Note: "Prob" in the last test name is a typo for "Prod"; renaming would
// change the test identifier, so it is only flagged here.)
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_static_shape){decompose_reduce_static_shape(reduce_mode::max)}

TEST_F(TransformationTestsF, DecomposeReduceMinTest_static_shape){decompose_reduce_static_shape(reduce_mode::min)}

TEST_F(TransformationTestsF, DecomposeReduceSumTest_static_shape){decompose_reduce_static_shape(reduce_mode::sum)}

TEST_F(TransformationTestsF, DecomposeReduceProbTest_static_shape){decompose_reduce_static_shape(reduce_mode::prod)}

// Static shape, but the reduce keeps a non-scalar output (only axis 1 is
// reduced): the pass must leave the graph untouched, so the reference model is
// built exactly like the input model.
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_static_shape_skip) {
    const ov::PartialShape shape = {256, 1024, 10};
    const ov::element::Type precision = ov::element::Type_t::f16;
    const std::vector<size_t> axes = {1};
    {
        model = build_model(shape, precision, axes, true, reduce_mode::max);
        manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();
    }
    model_ref = build_model(shape, precision, axes, true, reduce_mode::max);
}

// Dynamic shape reduce to scalar output, decompose reduce.
#define decompose_reduce_dynamic_shape(reduce_type) \
const ov::PartialShape in_shape = {4, -1, -1, 10}; \
const ov::element::Type in_type = ov::element::Type_t::f16; \
const std::vector<size_t> reduction_axes = {0, 1, 2, 3}; \
disable_rt_info_check(); \
{ \
model = build_model(in_shape, in_type, reduction_axes, false, reduce_mode::max); \
manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>(); \
} \
{ \
const auto in = std::make_shared<ov::op::v0::Parameter>(in_type, in_shape); \
std::shared_ptr<ov::op::util::ArithmeticReductionKeepDims> reduce = nullptr; \
create_reduce(in->get_default_output(), \
ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {3}), \
true, \
reduce_type); \
create_reduce(reduce->get_default_output(), \
ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), \
true, \
reduce_type); \
create_reduce(reduce->get_default_output(), \
ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {1}), \
true, \
reduce_type); \
create_reduce( \
reduce->get_default_output(), \
ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes), \
false, \
reduce_type); \
model_ref = std::make_shared<ov::Model>(ov::NodeVector{reduce}, ov::ParameterVector{in}); \
}

// Dynamic-shape reduce to a scalar output: the pass must decompose the reduce.
// Each test body is a single expansion of decompose_reduce_dynamic_shape.
// (Note: "Prob" in the last test name is a typo for "Prod"; renaming would
// change the test identifier, so it is only flagged here.)
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_dynamic_shape){decompose_reduce_dynamic_shape(reduce_mode::max)}

TEST_F(TransformationTestsF, DecomposeReduceMinTest_dynamic_shape){decompose_reduce_dynamic_shape(reduce_mode::min)}

TEST_F(TransformationTestsF, DecomposeReduceSumTest_dynamic_shape){decompose_reduce_dynamic_shape(reduce_mode::sum)}

TEST_F(TransformationTestsF, DecomposeReduceProbTest_dynamic_shape){decompose_reduce_dynamic_shape(reduce_mode::prod)}

// Dynamic shape, but only axis 2 is reduced so the output is non-scalar: the
// pass must leave the graph untouched, so the reference model is built exactly
// like the input model.
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_dynamic_shape_skip) {
    const ov::PartialShape shape = {4, -1, -1, 10};
    const ov::element::Type precision = ov::element::Type_t::f16;
    const std::vector<size_t> axes = {2};
    {
        model = build_model(shape, precision, axes, false, reduce_mode::max);
        manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();
    }
    model_ref = build_model(shape, precision, axes, false, reduce_mode::max);
}
Loading