-
Notifications
You must be signed in to change notification settings - Fork 2.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[GPU] optimize ReduceMax pattern #24073
Changes from 10 commits
7dd4c60
d858099
9e44524
c3a53b8
14526ff
8d4fb47
0cfcd3c
5d98877
521ddf5
498225a
a28c826
b735cfe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
// Copyright (C) 2018-2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#include "decompose_reduce_scalar_output.hpp" | ||
|
||
#include <algorithm> | ||
#include <memory> | ||
#include <vector> | ||
|
||
#include "openvino/core/node.hpp" | ||
#include "openvino/core/rt_info.hpp" | ||
#include "openvino/op/constant.hpp" | ||
#include "openvino/op/reduce_max.hpp" | ||
#include "openvino/op/reduce_mean.hpp" | ||
#include "openvino/op/reduce_min.hpp" | ||
#include "openvino/op/reduce_prod.hpp" | ||
#include "openvino/op/reduce_sum.hpp" | ||
#include "openvino/pass/pattern/op/wrap_type.hpp" | ||
#include "transformations/utils/utils.hpp" | ||
|
||
// Instantiate into the local `reduce_new` a reduce op of the same concrete type as
// the matched `reduce_orig`, with the given input/axes/keep_dims. For any other type
// the enclosing matcher callback bails out with `false` — note that ReduceMean is
// matched by the pattern but deliberately not handled here, so ReduceMean matches
// are rejected.
#define CREATE_REDUCE(input, reduce_const, keep_dims)                                          \
    if (ov::is_type<ov::op::v1::ReduceMax>(reduce_orig))                                       \
        reduce_new = std::make_shared<ov::op::v1::ReduceMax>(input, reduce_const, keep_dims);  \
    else if (ov::is_type<ov::op::v1::ReduceMin>(reduce_orig))                                  \
        reduce_new = std::make_shared<ov::op::v1::ReduceMin>(input, reduce_const, keep_dims);  \
    else if (ov::is_type<ov::op::v1::ReduceSum>(reduce_orig))                                  \
        reduce_new = std::make_shared<ov::op::v1::ReduceSum>(input, reduce_const, keep_dims);  \
    else if (ov::is_type<ov::op::v1::ReduceProd>(reduce_orig))                                 \
        reduce_new = std::make_shared<ov::op::v1::ReduceProd>(input, reduce_const, keep_dims); \
    else                                                                                       \
        return false;
|
||
ov::intel_gpu::DecomposeReduceForScalarOutput::DecomposeReduceForScalarOutput() {
    // Predicate for the reduction-axes constant: accept only a static 1-D axes tensor
    // listing more than one axis — single-axis reductions are left untouched.
    auto check_reduce_shape = [=](Output<Node> output) -> bool {
        auto reduce_shape = output.get_partial_shape();
        if (reduce_shape.is_dynamic() || reduce_shape.size() != 1) {
            return false;
        } else if (reduce_shape.to_shape()[0] <= 1) {
            return false;
        }
        return true;
    };
    auto reduce_pattern = ov::pass::pattern::wrap_type<ov::op::v1::ReduceSum,
                                                       ov::op::v1::ReduceMean,
                                                       ov::op::v1::ReduceProd,
                                                       ov::op::v1::ReduceMin,
                                                       ov::op::v1::ReduceMax>(
        {ov::pass::pattern::any_input(), ov::pass::pattern::wrap_type<ov::op::v0::Constant>(check_reduce_shape)});

    // register callback
    ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) {
        const auto& pattern_map = m.get_pattern_value_map();
        auto reduce_orig =
            as_type_ptr<op::util::ArithmeticReductionKeepDims>(pattern_map.at(reduce_pattern).get_node_shared_ptr());
        if (!reduce_orig || transformation_callback(reduce_orig))
            return false;

        const auto input_shape = reduce_orig->input_value(0).get_partial_shape();
        // The checks below query input_shape.size(), which needs a static rank;
        // bail out early for fully dynamic-rank inputs.
        if (input_shape.rank().is_dynamic())
            return false;

        // Only handle full reductions: the axes constant must cover every input
        // dimension, i.e. the cases that collapse the tensor to a scalar-like output.
        const auto reduce_shape = reduce_orig->input_value(1).get_partial_shape();
        if (reduce_shape.to_shape()[0] != input_shape.size())
            return false;

        auto dynamic_shape = false;
        const auto output_shape = reduce_orig->get_output_partial_shape(0);
        if (input_shape.is_dynamic() || output_shape.is_dynamic()) {
            dynamic_shape = true;
        }

        std::shared_ptr<ov::op::util::ArithmeticReductionKeepDims> reduce_new = nullptr;
        if (!dynamic_shape) {
            // Output size decides at most how many EU threads can be used for this node execution,
            // less than 4 EU threads to execute a primitive will lead to poor performance.
            if (ov::shape_size(output_shape.to_shape()) > 4) {
                return false;
            }
            // Input shape is too small, 1 EU thread should be enough.
            const auto input_static_shape = input_shape.to_shape();
            if (ov::shape_size(input_static_shape) < 64) {
                return false;
            }

            // Find the longest dimension: reduce it first so the remaining work can be
            // spread across more EU threads.
            size_t max_dim = std::distance(input_static_shape.begin(),
                                           std::max_element(input_static_shape.begin(), input_static_shape.end()));
            // If a single dimension already holds all the elements (the others are 1),
            // splitting the reduction cannot add any parallelism.
            if (input_static_shape[max_dim] == ov::shape_size(input_static_shape)) {
                return false;
            }

            CREATE_REDUCE(reduce_orig->input_value(0),
                          ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {max_dim}),
                          true);

        } else if (input_shape.rank().is_static()) {
            // Dynamic shape and output shape is [0], which will lead to 1 EU thread to do all work.
            auto input = reduce_orig->input_value(0);
            for (size_t i = input_shape.size() - 1; i > 0; i--) {
                // Reduce one dimension at a time to avoid having 1 EU thread do all the
                // work. Axis 0 is intentionally skipped; dimensions that are dynamic or
                // large enough (>= 4) are each reduced with keep_dims=true.
                if (input_shape[i].is_dynamic() || (input_shape[i].is_static() && input_shape[i].get_length() >= 4)) {
                    CREATE_REDUCE(input, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {i}), true);
                    input = reduce_new->get_default_output();
                }
            }
        }
        if (!reduce_new)
            return false;

        // Final reduce over the original axes restores the original output shape and
        // keep_dims semantics of the matched node.
        CREATE_REDUCE(reduce_new->get_default_output(), reduce_orig->input_value(1), reduce_orig->get_keep_dims());
        reduce_new->set_friendly_name(reduce_orig->get_friendly_name());
        copy_runtime_info(reduce_orig, reduce_new);
        replace_node(reduce_orig, reduce_new);
        return true;
    };

    auto m = std::make_shared<ov::pass::pattern::Matcher>(reduce_pattern, "DecomposeReduceForScalarOutput");
    register_matcher(m, callback);
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
// Copyright (C) 2018-2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#pragma once | ||
|
||
#include "openvino/core/visibility.hpp" | ||
#include "openvino/pass/graph_rewrite.hpp" | ||
|
||
namespace ov { | ||
namespace intel_gpu { | ||
|
||
// In some cases a Reduce op collapses a 2D/3D/4D/5D tensor to a scalar output. Since
// there is only one output value, all of the computation lands in a single EU thread,
// which results in very poor performance. This pass detects that case and decomposes
// the Reduce dimension by dimension so the work can be spread across more EU threads.
class DecomposeReduceForScalarOutput : public ov::pass::MatcherPass {
public:
    OPENVINO_RTTI("DecomposeReduceForScalarOutput", "0");
    DecomposeReduceForScalarOutput();
};
|
||
} // namespace intel_gpu | ||
} // namespace ov |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
// Copyright (C) 2018-2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#include <gtest/gtest.h> | ||
|
||
#include <memory> | ||
#include <openvino/core/model.hpp> | ||
#include <openvino/opsets/opset9.hpp> | ||
#include <openvino/pass/manager.hpp> | ||
#include <plugin/transformations/decompose_reduce_scalar_output.hpp> | ||
#include <string> | ||
|
||
#include "common_test_utils/ov_test_utils.hpp" | ||
#include "intel_gpu/primitives/reduce.hpp" | ||
|
||
using namespace testing; | ||
using namespace ov::intel_gpu; | ||
using namespace cldnn; | ||
using ReduceType = cldnn::reduce_mode; | ||
|
||
// Builds a reduce op of the kind selected by `reduce_type` into the local `reduce`
// variable; asserts if `reduce_type` maps to no supported op.
// NOTE(review): passing the op type (e.g. ov::op::v1::ReduceSum) directly — ideally via
// a template instead of a macro — would avoid depending on cldnn::reduce_mode in
// transformation unit tests.
#define create_reduce(arg, reduction, keep_dims, reduce_type)                         \
    if (reduce_type == reduce_mode::sum)                                              \
        reduce = std::make_shared<ov::op::v1::ReduceSum>(arg, reduction, keep_dims);  \
    else if (reduce_type == reduce_mode::min)                                         \
        reduce = std::make_shared<ov::op::v1::ReduceMin>(arg, reduction, keep_dims);  \
    else if (reduce_type == reduce_mode::max)                                         \
        reduce = std::make_shared<ov::op::v1::ReduceMax>(arg, reduction, keep_dims);  \
    else if (reduce_type == reduce_mode::prod)                                        \
        reduce = std::make_shared<ov::op::v1::ReduceProd>(arg, reduction, keep_dims); \
    OPENVINO_ASSERT(reduce != nullptr, "cannot create reduce: ", static_cast<int>(reduce_type));
|
||
// Builds a single-reduce model: Parameter -> Reduce(reduction_axes, keep_dim) -> Result,
// where the reduce op kind is selected by `reduce_type`.
static std::shared_ptr<ov::Model> build_model(const ov::PartialShape& input_shape,
                                              const ov::element::Type& input_type,
                                              const std::vector<size_t>& reduction_axes,
                                              const bool keep_dim,
                                              const ReduceType reduce_type) {
    const auto param = std::make_shared<ov::op::v0::Parameter>(input_type, input_shape);
    const auto axes =
        ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes);
    std::shared_ptr<ov::op::util::ArithmeticReductionKeepDims> reduce = nullptr;
    create_reduce(param->get_default_output(), axes, keep_dim, reduce_type);
    return std::make_shared<ov::Model>(ov::NodeVector{reduce}, ov::ParameterVector{param});
}
|
||
// Static-shape case: `model` is a full reduce-to-scalar of `reduce_type`; `model_ref`
// is the expected decomposition — first reduce the longest dimension (axis 2) with
// keep_dims=true, then reduce all original axes.
#define decompose_reduce_static_shape(reduce_type)                                                            \
    const ov::PartialShape in_shape = {1, 256, 1024, 10};                                                     \
    const ov::element::Type in_type = ov::element::Type_t::f16;                                               \
    const std::vector<size_t> reduction_axes = {0, 1, 2, 3};                                                  \
    disable_rt_info_check();                                                                                  \
    {                                                                                                         \
        model = build_model(in_shape, in_type, reduction_axes, false, reduce_type);                           \
        manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();                               \
    }                                                                                                         \
    {                                                                                                         \
        const auto in = std::make_shared<ov::op::v0::Parameter>(in_type, in_shape);                           \
        std::shared_ptr<ov::op::util::ArithmeticReductionKeepDims> reduce = nullptr;                          \
        create_reduce(in->get_default_output(),                                                               \
                      ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}),                      \
                      true,                                                                                   \
                      reduce_type);                                                                           \
        /* The pass emits a final reduce of the SAME kind as the original op, so the */                       \
        /* reference must use reduce_type here (was hard-coded to reduce_mode::max,  */                       \
        /* which broke the min/sum/prod variants).                                   */                       \
        create_reduce(                                                                                        \
            reduce->get_default_output(),                                                                     \
            ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes), \
            false,                                                                                            \
            reduce_type);                                                                                     \
        model_ref = std::make_shared<ov::Model>(ov::NodeVector{reduce}, ov::ParameterVector{in});             \
    }
|
||
// Static shape reduced to a scalar output: the pass must decompose the reduce.
// One test per supported reduce kind (max/min/sum/prod).
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_static_shape){decompose_reduce_static_shape(reduce_mode::max)}

TEST_F(TransformationTestsF, DecomposeReduceMinTest_static_shape){decompose_reduce_static_shape(reduce_mode::min)}

TEST_F(TransformationTestsF, DecomposeReduceSumTest_static_shape){decompose_reduce_static_shape(reduce_mode::sum)}

// NOTE(review): "Prob" in the test name presumably means "Prod" (product) — confirm.
TEST_F(TransformationTestsF, DecomposeReduceProbTest_static_shape){decompose_reduce_static_shape(reduce_mode::prod)}
|
||
// Static shape reduced to a non-scalar output: the pass must leave the graph unchanged,
// so the reference model is identical to the input model.
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_static_shape_skip) {
    const ov::PartialShape in_shape = {256, 1024, 10};
    const ov::element::Type in_type = ov::element::Type_t::f16;
    const std::vector<size_t> reduction_axes = {1};
    model = build_model(in_shape, in_type, reduction_axes, true, reduce_mode::max);
    manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();
    model_ref = build_model(in_shape, in_type, reduction_axes, true, reduce_mode::max);
}
|
||
// Dynamic-shape case: `model` is a full reduce-to-scalar of `reduce_type`; `model_ref`
// mirrors the pass — axes 3, 2, 1 reduced one by one with keep_dims=true, then all
// original axes reduced to produce the scalar output.
#define decompose_reduce_dynamic_shape(reduce_type)                                                           \
    const ov::PartialShape in_shape = {4, -1, -1, 10};                                                        \
    const ov::element::Type in_type = ov::element::Type_t::f16;                                               \
    const std::vector<size_t> reduction_axes = {0, 1, 2, 3};                                                  \
    disable_rt_info_check();                                                                                  \
    {                                                                                                         \
        /* Build the input model with reduce_type (was hard-coded to reduce_mode::max, */                     \
        /* which made the min/sum/prod variants compare a Max model against a          */                     \
        /* reduce_type reference).                                                     */                     \
        model = build_model(in_shape, in_type, reduction_axes, false, reduce_type);                           \
        manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();                               \
    }                                                                                                         \
    {                                                                                                         \
        const auto in = std::make_shared<ov::op::v0::Parameter>(in_type, in_shape);                           \
        std::shared_ptr<ov::op::util::ArithmeticReductionKeepDims> reduce = nullptr;                          \
        create_reduce(in->get_default_output(),                                                               \
                      ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {3}),                      \
                      true,                                                                                   \
                      reduce_type);                                                                           \
        create_reduce(reduce->get_default_output(),                                                           \
                      ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}),                      \
                      true,                                                                                   \
                      reduce_type);                                                                           \
        create_reduce(reduce->get_default_output(),                                                           \
                      ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {1}),                      \
                      true,                                                                                   \
                      reduce_type);                                                                           \
        create_reduce(                                                                                        \
            reduce->get_default_output(),                                                                     \
            ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes), \
            false,                                                                                            \
            reduce_type);                                                                                     \
        model_ref = std::make_shared<ov::Model>(ov::NodeVector{reduce}, ov::ParameterVector{in});             \
    }
|
||
// Dynamic shape reduced to a scalar output: the pass must decompose the reduce.
// One test per supported reduce kind (max/min/sum/prod).
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_dynamic_shape){decompose_reduce_dynamic_shape(reduce_mode::max)}

TEST_F(TransformationTestsF, DecomposeReduceMinTest_dynamic_shape){decompose_reduce_dynamic_shape(reduce_mode::min)}

TEST_F(TransformationTestsF, DecomposeReduceSumTest_dynamic_shape){decompose_reduce_dynamic_shape(reduce_mode::sum)}

// NOTE(review): "Prob" in the test name presumably means "Prod" (product) — confirm.
TEST_F(TransformationTestsF, DecomposeReduceProbTest_dynamic_shape){decompose_reduce_dynamic_shape(reduce_mode::prod)}
|
||
// Dynamic shape reduced to a non-scalar output: the pass must leave the graph unchanged,
// so the reference model is identical to the input model.
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_dynamic_shape_skip) {
    const ov::PartialShape in_shape = {4, -1, -1, 10};
    const ov::element::Type in_type = ov::element::Type_t::f16;
    const std::vector<size_t> reduction_axes = {2};
    model = build_model(in_shape, in_type, reduction_axes, false, reduce_mode::max);
    manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();
    model_ref = build_model(in_shape, in_type, reduction_axes, false, reduce_mode::max);
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: that can also be a part of reduce predicate I believe
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done