Skip to content

Commit

Permalink
[GPU] optimize ReduceMax pattern (#24073)
Browse files Browse the repository at this point in the history
### Details
- Optimize ReduceMax pattern to avoid scheduling the whole primitive
executed in single EU

Sometimes a ReduceMax op is used to convert a 3D/4D tensor to a scalar
output, which leads to all computation being executed in a single EU due to
there being only one output. This causes very poor performance for some models.
For example, in the Grounding DINO model:
`ReduceMax cost 59.24 ms and consumed 49% of the execution time of the whole
model.`

To break this bottleneck, this PR applies more EUs to execute this
primitive by doing ReduceMax one dimension at a time. We also
notice that the ReduceMax op selects the ref-kernel rather than the opt-kernel,
which may also cause some performance issues. But since the ReduceMax
op doesn't need much computation, the ref-kernel should be enough. The
key problem is that only one EU is scheduled to do the whole ReduceMax
computation, which is the root cause of the poor performance.

    

![image](https://github.com/openvinotoolkit/openvino/assets/31196718/561b98cc-98af-44a9-9ec2-36e5c63de797)


![image](https://github.com/openvinotoolkit/openvino/assets/31196718/d35c33f9-0714-4871-b049-354ccacd95ea)



     Test result shows:
ReduceMax will be improved from 59.24ms to 2.25ms, fps from 8.24 to
15.55 (+88% improvement)

          

![image](https://github.com/openvinotoolkit/openvino/assets/31196718/4b981bc5-251c-4913-a5d1-84d5bbd5aec8)

               


### Tickets:
 - *145690*
  • Loading branch information
riverlijunjie authored Aug 16, 2024
1 parent c6ab24a commit 8b82aae
Show file tree
Hide file tree
Showing 4 changed files with 276 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "decompose_reduce_scalar_output.hpp"

#include <algorithm>
#include <memory>
#include <vector>

#include "openvino/core/node.hpp"
#include "openvino/core/rt_info.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/reduce_max.hpp"
#include "openvino/op/reduce_min.hpp"
#include "openvino/op/reduce_prod.hpp"
#include "openvino/op/reduce_sum.hpp"
#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "transformations/utils/utils.hpp"

// Instantiate a new reduce node of the same arithmetic kind as `reduce_orig`
// (Sum / Min / Max / Prod) and store it into `reduce_new`.
// NOTE: expands to an if/else chain ending in a bare `return false;`, so it is
// only safe inside a bool-returning function (the matcher callback) where both
// `reduce_orig` and `reduce_new` are in scope; unsupported reduce types abort
// the callback.
#define CREATE_REDUCE(input, reduce_const, keep_dims)                                          \
    if (ov::is_type<ov::op::v1::ReduceSum>(reduce_orig))                                       \
        reduce_new = std::make_shared<ov::op::v1::ReduceSum>(input, reduce_const, keep_dims);  \
    else if (ov::is_type<ov::op::v1::ReduceMin>(reduce_orig))                                  \
        reduce_new = std::make_shared<ov::op::v1::ReduceMin>(input, reduce_const, keep_dims);  \
    else if (ov::is_type<ov::op::v1::ReduceMax>(reduce_orig))                                  \
        reduce_new = std::make_shared<ov::op::v1::ReduceMax>(input, reduce_const, keep_dims);  \
    else if (ov::is_type<ov::op::v1::ReduceProd>(reduce_orig))                                 \
        reduce_new = std::make_shared<ov::op::v1::ReduceProd>(input, reduce_const, keep_dims); \
    else                                                                                       \
        return false;

ov::intel_gpu::DecomposeReduceForScalarOutput::DecomposeReduceForScalarOutput() {
    // Matcher predicate: only fire on Reduce ops that collapse every input
    // dimension (scalar-like output), where executing everything in a single
    // EU thread would be the bottleneck this pass exists to remove.
    auto check_reduce_shape = [=](Output<Node> output) -> bool {
        const auto reduce = ov::as_type_ptr<op::util::ArithmeticReductionKeepDims>(output.get_node_shared_ptr());
        const auto input_shape = reduce->input_value(0).get_partial_shape();
        const auto reduce_shape = reduce->input_value(1).get_partial_shape();
        // The axes input must be a static 1-D tensor listing one axis per
        // input dimension, i.e. the reduce collapses all dimensions.
        // NOTE(review): input_shape.size() requires a static rank — presumably
        // a dynamic-rank input never reaches this predicate; confirm.
        if (reduce_shape.is_dynamic() || reduce_shape.size() != 1) {
            return false;
        } else if (reduce_shape.to_shape()[0] <= 1 || reduce_shape.to_shape()[0] != input_shape.size()) {
            return false;
        }
        const auto output_shape = reduce->get_output_partial_shape(0);
        if (output_shape.is_static() && input_shape.is_static()) {
            // Output size decides at most how many EU threads can be used for this node execution,
            // less than 4 EU threads to execute a primitive will lead to poor performance.
            if (ov::shape_size(output_shape.to_shape()) > 4) {
                return false;
            }
            // Input shape is too small, 1 EU thread should be enough.
            const auto input_static_shape = input_shape.to_shape();
            if (ov::shape_size(input_static_shape) < 64) {
                return false;
            }
        }
        return true;
    };

    auto reduce_pattern = ov::pass::pattern::
        wrap_type<ov::op::v1::ReduceSum, ov::op::v1::ReduceProd, ov::op::v1::ReduceMin, ov::op::v1::ReduceMax>(
            {ov::pass::pattern::any_input(), ov::pass::pattern::wrap_type<ov::op::v0::Constant>()},
            check_reduce_shape);

    // register callback
    ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) {
        const auto& pattern_map = m.get_pattern_value_map();
        auto reduce_orig =
            as_type_ptr<op::util::ArithmeticReductionKeepDims>(pattern_map.at(reduce_pattern).get_node_shared_ptr());
        if (!reduce_orig || transformation_callback(reduce_orig))
            return false;

        const auto input_shape = reduce_orig->input_value(0).get_partial_shape();
        const auto output_shape = reduce_orig->get_output_partial_shape(0);
        bool dynamic_shape = input_shape.is_dynamic() || output_shape.is_dynamic();
        std::shared_ptr<ov::op::util::ArithmeticReductionKeepDims> reduce_new = nullptr;
        if (!dynamic_shape) {
            // Find the longest dimension: reducing it first leaves the largest
            // possible intermediate output, so the most EU threads can be
            // scheduled for that first reduce.
            const auto input_static_shape = input_shape.to_shape();
            size_t max_dim = std::distance(input_static_shape.begin(),
                                           std::max_element(input_static_shape.begin(), input_static_shape.end()));
            // All other dimensions are 1 — splitting adds no parallelism.
            if (input_static_shape[max_dim] == ov::shape_size(input_static_shape)) {
                return false;
            }
            CREATE_REDUCE(reduce_orig->input_value(0),
                          ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {max_dim}),
                          true);

        } else if (input_shape.rank().is_static()) {
            // Dynamic shape and output shape is [0], which will lead to 1 EU thread to do all work.
            auto input = reduce_orig->input_value(0);
            // Chain per-axis reduces from the innermost axis down to axis 1;
            // axis 0 (and any skipped small static axes) is handled by the
            // final all-axes reduce appended below.
            for (size_t i = input_shape.size() - 1; i > 0; i--) {
                // Reduce one dimension by one dimension to avoid 1 EU thread do all work.
                if (input_shape[i].is_dynamic() || (input_shape[i].is_static() && input_shape[i].get_length() >= 4)) {
                    CREATE_REDUCE(input, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {i}), true);
                    input = reduce_new->get_default_output();
                }
            }
        }
        // No decomposition step was created (e.g. every candidate axis was
        // static and < 4) — leave the graph unchanged.
        if (!reduce_new)
            return false;

        // Final reduce with the original axes and keep_dims finishes the job
        // and takes over the original node's name and runtime info.
        CREATE_REDUCE(reduce_new->get_default_output(), reduce_orig->input_value(1), reduce_orig->get_keep_dims());
        reduce_new->set_friendly_name(reduce_orig->get_friendly_name());
        copy_runtime_info(reduce_orig, reduce_new);
        replace_node(reduce_orig, reduce_new);
        return true;
    };

    auto m = std::make_shared<ov::pass::pattern::Matcher>(reduce_pattern, "DecomposeReduceForScalarOutput");
    register_matcher(m, callback);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/core/visibility.hpp"
#include "openvino/pass/graph_rewrite.hpp"

namespace ov {
namespace intel_gpu {

// In some cases a Reduce op is used to reduce a 2D/3D/4D/5D tensor to a scalar
// output, which leads to all computation being executed in a single EU thread
// (there is only one output element), resulting in very poor performance. This
// pass detects that case and decomposes the Reduce dimension by dimension so
// more EU threads can be scheduled.
class DecomposeReduceForScalarOutput : public ov::pass::MatcherPass {
public:
    OPENVINO_RTTI("DecomposeReduceForScalarOutput", "0");
    DecomposeReduceForScalarOutput();
};

}  // namespace intel_gpu
}  // namespace ov
2 changes: 2 additions & 0 deletions src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
#include "plugin/transformations/convert_fc_to_compressed.hpp"
#include "plugin/transformations/convert_matmul_to_fc.hpp"
#include "plugin/transformations/convert_stridedslices_to_variadicsplit.hpp"
#include "plugin/transformations/decompose_reduce_scalar_output.hpp"
#include "plugin/transformations/fc_convert_fusion.hpp"
#include "plugin/transformations/fc_horizontal_fusion.hpp"
#include "plugin/transformations/kv_cache_fusion.hpp"
Expand Down Expand Up @@ -408,6 +409,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
manager.register_pass<ov::pass::ConvertMulticlassNmsToMulticlassNmsIE>();
manager.register_pass<ov::pass::TransposeMatMul>();
manager.register_pass<ov::pass::ConvertPad12ToPad1, false>();
manager.register_pass<DecomposeReduceForScalarOutput>();

precisions_map int_convert_precision_map {
{ov::element::i64, ov::element::i32},
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <gtest/gtest.h>

#include <memory>
#include <openvino/core/model.hpp>
#include <openvino/core/node_output.hpp>
#include <openvino/opsets/opset9.hpp>
#include <openvino/pass/manager.hpp>
#include <plugin/transformations/decompose_reduce_scalar_output.hpp>
#include <string>

#include "common_test_utils/ov_test_utils.hpp"
#include "intel_gpu/primitives/reduce.hpp"

using namespace testing;
using namespace ov::intel_gpu;
using namespace ov;

template <class T>
std::shared_ptr<ov::Model> build_model(const ov::PartialShape& input_shape,
                                       const ov::element::Type& input_type,
                                       const std::vector<size_t>& reduction_axes,
                                       const bool keep_dim) {
    // Builds the minimal graph Parameter -> Reduce(axes constant) -> Result,
    // where T is the concrete reduce op type (ReduceMax/Min/Sum/Prod).
    const auto param = std::make_shared<ov::op::v0::Parameter>(input_type, input_shape);
    const auto axes =
        ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes);
    const auto reduce_node = std::make_shared<T>(param->get_default_output(), axes, keep_dim);
    return std::make_shared<ov::Model>(ov::NodeVector{reduce_node}, ov::ParameterVector{param});
}

// Builds `model` (a single all-axes reduce over static shape {1, 256, 1024, 10})
// and the expected `model_ref` after DecomposeReduceForScalarOutput: the largest
// dimension (axis 2, size 1024) is reduced first with keep_dims=true, then the
// original all-axes reduce finishes the collapse to a scalar.
// NOTE: relies on TransformationTestsF members `model`, `model_ref`, `manager`
// and on disable_rt_info_check(), so it must be expanded inside a TEST_F body.
#define decompose_reduce_static_shape(reduce_type)                                                             \
    const ov::PartialShape in_shape = {1, 256, 1024, 10};                                                      \
    const ov::element::Type in_type = ov::element::Type_t::f16;                                                \
    const std::vector<size_t> reduction_axes = {0, 1, 2, 3};                                                   \
    disable_rt_info_check();                                                                                   \
    {                                                                                                          \
        model = build_model<reduce_type>(in_shape, in_type, reduction_axes, false);                            \
        manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();                                \
    }                                                                                                          \
    {                                                                                                          \
        const auto in = std::make_shared<ov::op::v0::Parameter>(in_type, in_shape);                            \
        auto reduce = std::make_shared<reduce_type>(in->get_default_output(),                                  \
                                                    ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), \
                                                    true);                                                     \
        reduce = std::make_shared<reduce_type>(                                                                \
            reduce->get_default_output(),                                                                      \
            ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes),  \
            false);                                                                                            \
        model_ref = std::make_shared<ov::Model>(ov::NodeVector{reduce}, ov::ParameterVector{in});              \
    }

// Static shape reduce to scalar output, decompose reduce.
// One case per supported reduce type; the shared expected graph comes from the
// decompose_reduce_static_shape macro above.
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_static_shape){decompose_reduce_static_shape(ov::op::v1::ReduceMax)}

TEST_F(TransformationTestsF, DecomposeReduceMinTest_static_shape){decompose_reduce_static_shape(ov::op::v1::ReduceMin)}

TEST_F(TransformationTestsF, DecomposeReduceSumTest_static_shape){decompose_reduce_static_shape(ov::op::v1::ReduceSum)}

TEST_F(TransformationTestsF,
       DecomposeReduceProbTest_static_shape){decompose_reduce_static_shape(ov::op::v1::ReduceProd)}

// Static shape with small input, don't decompose reduce.
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_static_shape_small_input_skip) {
    const ov::element::Type in_type = ov::element::Type_t::f16;
    const ov::PartialShape in_shape = {1, 2, 8, 2};
    const std::vector<size_t> reduction_axes = {0, 1, 2, 3};
    {
        model = build_model<ov::op::v1::ReduceMax>(in_shape, in_type, reduction_axes, true);
        manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();
    }
    {
        // Only 32 input elements — the pass must leave the graph untouched.
        model_ref = build_model<ov::op::v1::ReduceMax>(in_shape, in_type, reduction_axes, true);
    }
}

// Static shape reduce to non scalar output, don't decompose reduce.
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_static_shape_skip) {
    const ov::element::Type in_type = ov::element::Type_t::f16;
    const ov::PartialShape in_shape = {256, 1024, 10};
    const std::vector<size_t> reduction_axes = {1};
    {
        model = build_model<ov::op::v1::ReduceMax>(in_shape, in_type, reduction_axes, true);
        manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();
    }
    {
        // Only one axis is reduced, so the output is not scalar-like — skip.
        model_ref = build_model<ov::op::v1::ReduceMax>(in_shape, in_type, reduction_axes, true);
    }
}

// Dynamic shape reduce to scalar output, decompose reduce.
// Builds `model` (an all-axes reduce over dynamic shape {4, -1, -1, 10}) and
// the expected `model_ref`: one keep_dims reduce per qualifying axis, chained
// from the innermost axis (3) down to axis 1, followed by the original
// all-axes reduce. Axis 0 (static size 4) is left to the final reduce.
// NOTE: relies on TransformationTestsF members `model`, `model_ref`, `manager`
// and on disable_rt_info_check(), so it must be expanded inside a TEST_F body.
#define decompose_reduce_dynamic_shape(reduce_type)                                                            \
    const ov::PartialShape in_shape = {4, -1, -1, 10};                                                         \
    const ov::element::Type in_type = ov::element::Type_t::f16;                                                \
    const std::vector<size_t> reduction_axes = {0, 1, 2, 3};                                                   \
    disable_rt_info_check();                                                                                   \
    {                                                                                                          \
        model = build_model<reduce_type>(in_shape, in_type, reduction_axes, false);                            \
        manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();                                \
    }                                                                                                          \
    {                                                                                                          \
        const auto in = std::make_shared<ov::op::v0::Parameter>(in_type, in_shape);                            \
        auto reduce = std::make_shared<reduce_type>(in->get_default_output(),                                  \
                                                    ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {3}), \
                                                    true);                                                     \
        reduce = std::make_shared<reduce_type>(reduce->get_default_output(),                                   \
                                               ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), \
                                               true);                                                          \
        reduce = std::make_shared<reduce_type>(reduce->get_default_output(),                                   \
                                               ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {1}), \
                                               true);                                                          \
        reduce = std::make_shared<reduce_type>(                                                                \
            reduce->get_default_output(),                                                                      \
            ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes),  \
            false);                                                                                            \
        model_ref = std::make_shared<ov::Model>(ov::NodeVector{reduce}, ov::ParameterVector{in});              \
    }

// One case per supported reduce type; the shared expected graph comes from the
// decompose_reduce_dynamic_shape macro above.
TEST_F(TransformationTestsF,
       DecomposeReduceMaxTest_dynamic_shape){decompose_reduce_dynamic_shape(ov::op::v1::ReduceMax)}

TEST_F(TransformationTestsF,
       DecomposeReduceMinTest_dynamic_shape){decompose_reduce_dynamic_shape(ov::op::v1::ReduceMin)}

TEST_F(TransformationTestsF,
       DecomposeReduceSumTest_dynamic_shape){decompose_reduce_dynamic_shape(ov::op::v1::ReduceSum)}

TEST_F(TransformationTestsF,
       DecomposeReduceProbTest_dynamic_shape){decompose_reduce_dynamic_shape(ov::op::v1::ReduceProd)}

// Dynamic shape reduce to non-scalar output, don't decompose reduce.
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_dynamic_shape_skip) {
    const ov::element::Type in_type = ov::element::Type_t::f16;
    const ov::PartialShape in_shape = {4, -1, -1, 10};
    const std::vector<size_t> reduction_axes = {2};
    {
        model = build_model<ov::op::v1::ReduceMax>(in_shape, in_type, reduction_axes, false);
        manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();
    }
    {
        // Only one axis is reduced, so the output is not scalar-like — skip.
        model_ref = build_model<ov::op::v1::ReduceMax>(in_shape, in_type, reduction_axes, false);
    }
}

0 comments on commit 8b82aae

Please sign in to comment.