Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GPU] optimize ReduceMax pattern #24073

Merged
Merged
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "decompose_reduce_scalar_output.hpp"

#include <algorithm>
#include <memory>
#include <vector>

#include "openvino/core/node.hpp"
#include "openvino/core/rt_info.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/reduce_max.hpp"
#include "openvino/op/reduce_mean.hpp"
#include "openvino/op/reduce_min.hpp"
#include "openvino/op/reduce_prod.hpp"
#include "openvino/op/reduce_sum.hpp"
#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "transformations/utils/utils.hpp"

// Creates a new reduce op of the same concrete type as `reduce_orig` (a variable
// expected to exist at every expansion site) and assigns it to `reduce_new`.
// NOTE: ReduceMean is matched by the pass pattern below but is deliberately not
// handled here — the trailing `else` bails out of the matcher callback for it
// (presumably because decomposing a mean needs rescaling — TODO confirm).
// No comments may be placed inside the macro body: they would swallow the `\`
// line continuations.
#define CREATE_REDUCE(input, reduce_const, keep_dims) \
if (ov::is_type<ov::op::v1::ReduceSum>(reduce_orig)) \
reduce_new = std::make_shared<ov::op::v1::ReduceSum>(input, reduce_const, keep_dims); \
else if (ov::is_type<ov::op::v1::ReduceMin>(reduce_orig)) \
reduce_new = std::make_shared<ov::op::v1::ReduceMin>(input, reduce_const, keep_dims); \
else if (ov::is_type<ov::op::v1::ReduceMax>(reduce_orig)) \
reduce_new = std::make_shared<ov::op::v1::ReduceMax>(input, reduce_const, keep_dims); \
else if (ov::is_type<ov::op::v1::ReduceProd>(reduce_orig)) \
reduce_new = std::make_shared<ov::op::v1::ReduceProd>(input, reduce_const, keep_dims); \
else \
return false;

// Matcher pass that rewrites a ReduceSum/Min/Max/Prod collapsing ALL input
// dimensions into a (near-)scalar output. With only a handful of output
// elements, the whole reduction lands on a single EU thread on GPU; decomposing
// it into a chain of single-axis reduces restores parallelism.
// NOTE: this block originally had GitHub review-comment text pasted into the
// middle of the callback (between the shape check and its `return false;`),
// which made the file non-compiling — that text is removed here; the logic is
// unchanged.
ov::intel_gpu::DecomposeReduceForScalarOutput::DecomposeReduceForScalarOutput() {
    // Predicate for the axes input: accept only a static 1D constant listing
    // more than one axis (a multi-axis reduction).
    auto check_reduce_shape = [=](Output<Node> output) -> bool {
        auto reduce_shape = output.get_partial_shape();
        if (reduce_shape.is_dynamic() || reduce_shape.size() != 1) {
            return false;
        } else if (reduce_shape.to_shape()[0] <= 1) {
            return false;
        }
        return true;
    };
    auto reduce_pattern = ov::pass::pattern::wrap_type<ov::op::v1::ReduceSum,
                                                       ov::op::v1::ReduceMean,
                                                       ov::op::v1::ReduceProd,
                                                       ov::op::v1::ReduceMin,
                                                       ov::op::v1::ReduceMax>(
        {ov::pass::pattern::any_input(), ov::pass::pattern::wrap_type<ov::op::v0::Constant>(check_reduce_shape)});

    // register callback
    ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) {
        const auto& pattern_map = m.get_pattern_value_map();
        auto reduce_orig =
            as_type_ptr<op::util::ArithmeticReductionKeepDims>(pattern_map.at(reduce_pattern).get_node_shared_ptr());
        if (!reduce_orig || transformation_callback(reduce_orig))
            return false;

        const auto input_shape = reduce_orig->input_value(0).get_partial_shape();
        const auto reduce_shape = reduce_orig->input_value(1).get_partial_shape();
        // Only handle a full reduction, i.e. the axes constant covers every input dim.
        if (reduce_shape.to_shape()[0] != input_shape.size())
            return false;

        auto dynamic_shape = false;
        const auto output_shape = reduce_orig->get_output_partial_shape(0);
        if (input_shape.is_dynamic() || output_shape.is_dynamic()) {
            dynamic_shape = true;
        }

        std::shared_ptr<ov::op::util::ArithmeticReductionKeepDims> reduce_new = nullptr;
        if (!dynamic_shape) {
            // Output size decides at most how many EU threads can be used for this node execution,
            // less than 4 EU threads to execute a primitive will lead to poor performance.
            if (ov::shape_size(output_shape.to_shape()) > 4) {
                return false;
            }
            // Input shape is too small, 1 EU thread should be enough.
            const auto input_static_shape = input_shape.to_shape();
            if (ov::shape_size(input_static_shape) < 64) {
                return false;
            }

            // Find the longest dimension: reducing it first leaves the product of
            // the remaining dims as parallelizable output elements.
            size_t max_dim = std::distance(input_static_shape.begin(),
                                           std::max_element(input_static_shape.begin(), input_static_shape.end()));
            // If every element lives in a single dimension, splitting the reduce
            // would not expose any extra parallelism — leave it alone.
            if (input_static_shape[max_dim] == ov::shape_size(input_static_shape)) {
                return false;
            }

            CREATE_REDUCE(reduce_orig->input_value(0),
                          ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {max_dim}),
                          true);

        } else if (input_shape.rank().is_static()) {
            // Dynamic shape and output shape is [0], which will lead to 1 EU thread to do all work.
            auto input = reduce_orig->input_value(0);
            // Dim 0 is intentionally skipped here; the final full-axes reduce
            // below still collapses it.
            for (size_t i = input_shape.size() - 1; i > 0; i--) {
                // Reduce one dimension by one dimension to avoid 1 EU thread do all work.
                if (input_shape[i].is_dynamic() || (input_shape[i].is_static() && input_shape[i].get_length() >= 4)) {
                    CREATE_REDUCE(input, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {i}), true);
                    input = reduce_new->get_default_output();
                }
            }
        }
        if (!reduce_new)
            return false;

        // Final reduce over the original axes restores the original output shape
        // and keep_dims semantics; the new node takes over name/rt_info/users.
        CREATE_REDUCE(reduce_new->get_default_output(), reduce_orig->input_value(1), reduce_orig->get_keep_dims());
        reduce_new->set_friendly_name(reduce_orig->get_friendly_name());
        copy_runtime_info(reduce_orig, reduce_new);
        replace_node(reduce_orig, reduce_new);
        return true;
    };

    auto m = std::make_shared<ov::pass::pattern::Matcher>(reduce_pattern, "DecomposeReduceForScalarOutput");
    register_matcher(m, callback);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/core/visibility.hpp"
#include "openvino/pass/graph_rewrite.hpp"

namespace ov {
namespace intel_gpu {

// In some cases a Reduce op collapses one 2D/3D/4D/5D tensor into a scalar output, which forces all
// of the computation onto a single EU thread (there is only one output element), resulting in very
// poor performance. This pass detects that case and decomposes the Reduce dimension by dimension to
// avoid it.
class DecomposeReduceForScalarOutput : public ov::pass::MatcherPass {
public:
    OPENVINO_RTTI("DecomposeReduceForScalarOutput", "0");
    DecomposeReduceForScalarOutput();
};

} // namespace intel_gpu
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
#include "plugin/transformations/convert_fc_to_compressed.hpp"
#include "plugin/transformations/convert_matmul_to_fc.hpp"
#include "plugin/transformations/convert_stridedslices_to_variadicsplit.hpp"
#include "plugin/transformations/decompose_reduce_scalar_output.hpp"
#include "plugin/transformations/fc_convert_fusion.hpp"
#include "plugin/transformations/fc_horizontal_fusion.hpp"
#include "plugin/transformations/kv_cache_fusion.hpp"
Expand Down Expand Up @@ -408,6 +409,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
manager.register_pass<ov::pass::ConvertMulticlassNmsToMulticlassNmsIE>();
manager.register_pass<ov::pass::TransposeMatMul>();
manager.register_pass<ov::pass::ConvertPad12ToPad1, false>();
manager.register_pass<DecomposeReduceForScalarOutput>();

precisions_map int_convert_precision_map {
{ov::element::i64, ov::element::i32},
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <gtest/gtest.h>

#include <memory>
#include <openvino/core/model.hpp>
#include <openvino/opsets/opset9.hpp>
#include <openvino/pass/manager.hpp>
#include <plugin/transformations/decompose_reduce_scalar_output.hpp>
#include <string>

#include "common_test_utils/ov_test_utils.hpp"
#include "intel_gpu/primitives/reduce.hpp"

using namespace testing;
using namespace ov::intel_gpu;
using namespace cldnn;
using ReduceType = cldnn::reduce_mode;

// Builds a Reduce op of the requested mode into the local variable `reduce`
// (each expansion site must declare `std::shared_ptr<...ArithmeticReductionKeepDims> reduce`).
// Unsupported modes (e.g. mean) leave `reduce` null and trip the assert.
// NOTE: this macro originally had GitHub review-comment text pasted between its
// first two branches, which made the file non-compiling — removed here, logic
// unchanged. (Reviewer also suggested replacing the macro with a template;
// that would change every call site, so it is left as a macro.)
#define create_reduce(arg, reduction, keep_dims, reduce_type)                             \
    if (reduce_type == reduce_mode::sum)                                                  \
        reduce = std::make_shared<ov::op::v1::ReduceSum>(arg, reduction, keep_dims);      \
    else if (reduce_type == reduce_mode::min)                                             \
        reduce = std::make_shared<ov::op::v1::ReduceMin>(arg, reduction, keep_dims);      \
    else if (reduce_type == reduce_mode::max)                                             \
        reduce = std::make_shared<ov::op::v1::ReduceMax>(arg, reduction, keep_dims);      \
    else if (reduce_type == reduce_mode::prod)                                            \
        reduce = std::make_shared<ov::op::v1::ReduceProd>(arg, reduction, keep_dims);     \
    OPENVINO_ASSERT(reduce != nullptr, "cannot create reduce: ", static_cast<int>(reduce_type));

// Builds a single-Reduce model: Parameter(input_type, input_shape) -> Reduce of
// `reduce_type` over `reduction_axes` (keep_dims = keep_dim).
// The local variable MUST be named `reduce` — the create_reduce macro assigns
// into that exact identifier.
static std::shared_ptr<ov::Model> build_model(const ov::PartialShape& input_shape,
                                              const ov::element::Type& input_type,
                                              const std::vector<size_t>& reduction_axes,
                                              const bool keep_dim,
                                              const ReduceType reduce_type) {
    const auto in = std::make_shared<ov::op::v0::Parameter>(input_type, input_shape);
    std::shared_ptr<ov::op::util::ArithmeticReductionKeepDims> reduce = nullptr;
    create_reduce(in->get_default_output(),
                  ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes),
                  keep_dim,
                  reduce_type);

    return std::make_shared<ov::Model>(ov::NodeVector{reduce}, ov::ParameterVector{in});
}

// Static full reduction {1,256,1024,10} -> scalar. Expected graph after the pass:
// first reduce the longest dimension (axis 2, size 1024) with keep_dims=true,
// then apply the original full-axes reduce.
// BUGFIX: the reference model's final reduce was hard-coded to reduce_mode::max,
// so the min/sum/prod variants compared against the wrong op type — it now uses
// the tested reduce_type, matching what the pass actually produces.
#define decompose_reduce_static_shape(reduce_type)                                                         \
    const ov::PartialShape in_shape = {1, 256, 1024, 10};                                                  \
    const ov::element::Type in_type = ov::element::Type_t::f16;                                            \
    const std::vector<size_t> reduction_axes = {0, 1, 2, 3};                                               \
    disable_rt_info_check();                                                                               \
    {                                                                                                      \
        model = build_model(in_shape, in_type, reduction_axes, false, reduce_type);                        \
        manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();                            \
    }                                                                                                      \
    {                                                                                                      \
        const auto in = std::make_shared<ov::op::v0::Parameter>(in_type, in_shape);                        \
        std::shared_ptr<ov::op::util::ArithmeticReductionKeepDims> reduce = nullptr;                       \
        create_reduce(in->get_default_output(),                                                            \
                      ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}),                   \
                      true,                                                                                \
                      reduce_type);                                                                        \
        create_reduce(                                                                                     \
            reduce->get_default_output(),                                                                  \
            ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes), \
            false,                                                                                         \
            reduce_type);                                                                                  \
        model_ref = std::make_shared<ov::Model>(ov::NodeVector{reduce}, ov::ParameterVector{in});          \
    }

// Static-shape reduce to a scalar output: the pass must decompose the reduce.
// Each test body is a single expansion of decompose_reduce_static_shape.
// (Note: "Prob" in the last test name is a typo for "Prod"; renaming would
// change the test identifier, so it is only flagged here.)
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_static_shape){decompose_reduce_static_shape(reduce_mode::max)}

TEST_F(TransformationTestsF, DecomposeReduceMinTest_static_shape){decompose_reduce_static_shape(reduce_mode::min)}

TEST_F(TransformationTestsF, DecomposeReduceSumTest_static_shape){decompose_reduce_static_shape(reduce_mode::sum)}

TEST_F(TransformationTestsF, DecomposeReduceProbTest_static_shape){decompose_reduce_static_shape(reduce_mode::prod)}

// Static shape, but the reduce keeps a non-scalar output (only axis 1 is
// reduced): the pass must leave the graph untouched, so the reference model is
// built exactly like the input model.
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_static_shape_skip) {
    const ov::PartialShape shape = {256, 1024, 10};
    const ov::element::Type precision = ov::element::Type_t::f16;
    const std::vector<size_t> axes = {1};
    {
        model = build_model(shape, precision, axes, true, reduce_mode::max);
        manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();
    }
    model_ref = build_model(shape, precision, axes, true, reduce_mode::max);
}

// Dynamic shape reduce to scalar output, decompose reduce.
#define decompose_reduce_dynamic_shape(reduce_type) \
const ov::PartialShape in_shape = {4, -1, -1, 10}; \
const ov::element::Type in_type = ov::element::Type_t::f16; \
const std::vector<size_t> reduction_axes = {0, 1, 2, 3}; \
disable_rt_info_check(); \
{ \
model = build_model(in_shape, in_type, reduction_axes, false, reduce_mode::max); \
manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>(); \
} \
{ \
const auto in = std::make_shared<ov::op::v0::Parameter>(in_type, in_shape); \
std::shared_ptr<ov::op::util::ArithmeticReductionKeepDims> reduce = nullptr; \
create_reduce(in->get_default_output(), \
ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {3}), \
true, \
reduce_type); \
create_reduce(reduce->get_default_output(), \
ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), \
true, \
reduce_type); \
create_reduce(reduce->get_default_output(), \
ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {1}), \
true, \
reduce_type); \
create_reduce( \
reduce->get_default_output(), \
ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes), \
false, \
reduce_type); \
model_ref = std::make_shared<ov::Model>(ov::NodeVector{reduce}, ov::ParameterVector{in}); \
}

// Dynamic-shape reduce to a scalar output: the pass must decompose the reduce.
// Each test body is a single expansion of decompose_reduce_dynamic_shape.
// (Note: "Prob" in the last test name is a typo for "Prod"; renaming would
// change the test identifier, so it is only flagged here.)
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_dynamic_shape){decompose_reduce_dynamic_shape(reduce_mode::max)}

TEST_F(TransformationTestsF, DecomposeReduceMinTest_dynamic_shape){decompose_reduce_dynamic_shape(reduce_mode::min)}

TEST_F(TransformationTestsF, DecomposeReduceSumTest_dynamic_shape){decompose_reduce_dynamic_shape(reduce_mode::sum)}

TEST_F(TransformationTestsF, DecomposeReduceProbTest_dynamic_shape){decompose_reduce_dynamic_shape(reduce_mode::prod)}

// Dynamic shape, but only axis 2 is reduced so the output is non-scalar: the
// pass must leave the graph untouched, so the reference model is built exactly
// like the input model.
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_dynamic_shape_skip) {
    const ov::PartialShape shape = {4, -1, -1, 10};
    const ov::element::Type precision = ov::element::Type_t::f16;
    const std::vector<size_t> axes = {2};
    {
        model = build_model(shape, precision, axes, false, reduce_mode::max);
        manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();
    }
    model_ref = build_model(shape, precision, axes, false, reduce_mode::max);
}
Loading