-
Notifications
You must be signed in to change notification settings - Fork 2.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[GPU] optimize ReduceMax pattern (#24073)
### Details - Optimize the ReduceMax pattern to avoid scheduling the whole primitive onto a single EU. Sometimes the ReduceMax op is used to convert a 3D/4D tensor to a scalar output, which leads to all computation being executed in a single EU thread because there is only one output. This causes very poor performance for some models. For example, in the Grounding DINO model: `ReduceMax cost 59.24 ms and consumed 49% of the whole model's execution time.` To break this bottleneck, this PR applies more EUs to execute this primitive by performing ReduceMax one dimension at a time. We also notice that the ReduceMax op selects the ref-kernel rather than the opt-kernel, which may also cause some performance issues. But since ReduceMax doesn't need much computation, the ref-kernel should be enough. The key problem is that only one EU thread is scheduled to do the whole ReduceMax computation, which is the root cause of the poor performance. ![image](https://github.com/openvinotoolkit/openvino/assets/31196718/561b98cc-98af-44a9-9ec2-36e5c63de797) ![image](https://github.com/openvinotoolkit/openvino/assets/31196718/d35c33f9-0714-4871-b049-354ccacd95ea) Test results show: ReduceMax improves from 59.24 ms to 2.25 ms, and fps from 8.24 to 15.55 (+88% improvement) ![image](https://github.com/openvinotoolkit/openvino/assets/31196718/4b981bc5-251c-4913-a5d1-84d5bbd5aec8) ### Tickets: - *145690*
- Loading branch information
1 parent
c6ab24a
commit 8b82aae
Showing
4 changed files
with
276 additions
and
0 deletions.
There are no files selected for viewing
111 changes: 111 additions & 0 deletions
111
src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_scalar_output.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
// Copyright (C) 2018-2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#include "decompose_reduce_scalar_output.hpp" | ||
|
||
#include <algorithm> | ||
#include <memory> | ||
#include <vector> | ||
|
||
#include "openvino/core/node.hpp" | ||
#include "openvino/core/rt_info.hpp" | ||
#include "openvino/op/constant.hpp" | ||
#include "openvino/op/reduce_max.hpp" | ||
#include "openvino/op/reduce_min.hpp" | ||
#include "openvino/op/reduce_prod.hpp" | ||
#include "openvino/op/reduce_sum.hpp" | ||
#include "openvino/pass/pattern/op/wrap_type.hpp" | ||
#include "transformations/utils/utils.hpp" | ||
|
||
// Assign `reduce_new` a fresh Reduce node of the same concrete type as `reduce_orig`,
// built from the given input, axes constant and keep_dims flag. The type checks are
// mutually exclusive, so branch order does not affect behavior. Falls out of the
// enclosing matcher callback (`return false`) for any unsupported reduce type.
#define CREATE_REDUCE(input, reduce_const, keep_dims)                                          \
    if (ov::is_type<ov::op::v1::ReduceMax>(reduce_orig))                                       \
        reduce_new = std::make_shared<ov::op::v1::ReduceMax>(input, reduce_const, keep_dims);  \
    else if (ov::is_type<ov::op::v1::ReduceMin>(reduce_orig))                                  \
        reduce_new = std::make_shared<ov::op::v1::ReduceMin>(input, reduce_const, keep_dims);  \
    else if (ov::is_type<ov::op::v1::ReduceSum>(reduce_orig))                                  \
        reduce_new = std::make_shared<ov::op::v1::ReduceSum>(input, reduce_const, keep_dims);  \
    else if (ov::is_type<ov::op::v1::ReduceProd>(reduce_orig))                                 \
        reduce_new = std::make_shared<ov::op::v1::ReduceProd>(input, reduce_const, keep_dims); \
    else                                                                                       \
        return false;
|
||
// Decompose a Reduce* op that collapses a whole tensor into a (near-)scalar output into a
// chain of single-axis reductions. With only one output element the whole primitive runs in
// a single EU thread; intermediate multi-element outputs let the GPU schedule more threads.
ov::intel_gpu::DecomposeReduceForScalarOutput::DecomposeReduceForScalarOutput() {
    // Predicate for the pattern: match only reductions whose axes input is a static 1D
    // constant covering every input dimension, and whose output is scalar-like.
    auto check_reduce_shape = [=](Output<Node> output) -> bool {
        const auto reduce = ov::as_type_ptr<op::util::ArithmeticReductionKeepDims>(output.get_node_shared_ptr());
        // Guard against a failed downcast before dereferencing (robustness fix).
        if (!reduce) {
            return false;
        }
        const auto input_shape = reduce->input_value(0).get_partial_shape();
        const auto reduce_shape = reduce->input_value(1).get_partial_shape();
        if (reduce_shape.is_dynamic() || reduce_shape.size() != 1) {
            return false;
        } else if (reduce_shape.to_shape()[0] <= 1 || reduce_shape.to_shape()[0] != input_shape.size()) {
            // Must reduce over more than one axis and over ALL input dimensions.
            return false;
        }
        const auto output_shape = reduce->get_output_partial_shape(0);
        if (output_shape.is_static() && input_shape.is_static()) {
            // Output size decides at most how many EU threads can be used for this node execution,
            // less than 4 EU threads to execute a primitive will lead to poor performance.
            if (ov::shape_size(output_shape.to_shape()) > 4) {
                return false;
            }
            // Input shape is too small, 1 EU thread should be enough.
            const auto input_static_shape = input_shape.to_shape();
            if (ov::shape_size(input_static_shape) < 64) {
                return false;
            }
        }
        return true;
    };

    auto reduce_pattern = ov::pass::pattern::
        wrap_type<ov::op::v1::ReduceSum, ov::op::v1::ReduceProd, ov::op::v1::ReduceMin, ov::op::v1::ReduceMax>(
            {ov::pass::pattern::any_input(), ov::pass::pattern::wrap_type<ov::op::v0::Constant>()},
            check_reduce_shape);

    // register callback
    ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) {
        const auto& pattern_map = m.get_pattern_value_map();
        auto reduce_orig =
            as_type_ptr<op::util::ArithmeticReductionKeepDims>(pattern_map.at(reduce_pattern).get_node_shared_ptr());
        if (!reduce_orig || transformation_callback(reduce_orig))
            return false;

        const auto input_shape = reduce_orig->input_value(0).get_partial_shape();
        const auto output_shape = reduce_orig->get_output_partial_shape(0);
        bool dynamic_shape = input_shape.is_dynamic() || output_shape.is_dynamic();
        std::shared_ptr<ov::op::util::ArithmeticReductionKeepDims> reduce_new = nullptr;
        if (!dynamic_shape) {
            // Find the longest dimension and reduce over it first (keep_dims=true), so the
            // intermediate output still has enough elements to occupy several EU threads.
            const auto input_static_shape = input_shape.to_shape();
            size_t max_dim = std::distance(input_static_shape.begin(),
                                           std::max_element(input_static_shape.begin(), input_static_shape.end()));
            // If all elements already live in a single dimension, decomposing cannot help.
            if (input_static_shape[max_dim] == ov::shape_size(input_static_shape)) {
                return false;
            }
            CREATE_REDUCE(reduce_orig->input_value(0),
                          ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {max_dim}),
                          true);
        } else if (input_shape.rank().is_static()) {
            // Dynamic shape and output shape is [0], which would lead to 1 EU thread doing all work.
            auto input = reduce_orig->input_value(0);
            // Reduce one dimension at a time, innermost first; dimension 0 is left for the
            // final reduce below. Skip static dims < 4 — too small to be worth a stage.
            for (size_t i = input_shape.size() - 1; i > 0; i--) {
                if (input_shape[i].is_dynamic() || (input_shape[i].is_static() && input_shape[i].get_length() >= 4)) {
                    CREATE_REDUCE(input, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {i}), true);
                    input = reduce_new->get_default_output();
                }
            }
        }
        if (!reduce_new)
            return false;

        // Final reduce reuses the original axes and keep_dims so the overall output
        // shape and semantics are unchanged for downstream consumers.
        CREATE_REDUCE(reduce_new->get_default_output(), reduce_orig->input_value(1), reduce_orig->get_keep_dims());
        reduce_new->set_friendly_name(reduce_orig->get_friendly_name());
        copy_runtime_info(reduce_orig, reduce_new);
        replace_node(reduce_orig, reduce_new);
        return true;
    };

    auto m = std::make_shared<ov::pass::pattern::Matcher>(reduce_pattern, "DecomposeReduceForScalarOutput");
    register_matcher(m, callback);
}
23 changes: 23 additions & 0 deletions
23
src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_scalar_output.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
// Copyright (C) 2018-2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#pragma once | ||
|
||
#include "openvino/core/visibility.hpp" | ||
#include "openvino/pass/graph_rewrite.hpp" | ||
|
||
namespace ov {
namespace intel_gpu {

// In some cases a Reduce op collapses a 2D/3D/4D/5D tensor into a scalar output. With only
// one output element the whole computation runs in a single EU thread, resulting in very
// poor performance. This pass detects that case and decomposes the Reduce dimension by
// dimension (see decompose_reduce_scalar_output.cpp) so more EU threads can be scheduled.
class DecomposeReduceForScalarOutput : public ov::pass::MatcherPass {
public:
    OPENVINO_RTTI("DecomposeReduceForScalarOutput", "0");
    DecomposeReduceForScalarOutput();
};

}  // namespace intel_gpu
}  // namespace ov
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
140 changes: 140 additions & 0 deletions
140
src/plugins/intel_gpu/tests/unit/transformations/decompose_reduce_scalar_output_test.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
// Copyright (C) 2018-2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#include <gtest/gtest.h> | ||
|
||
#include <memory> | ||
#include <openvino/core/model.hpp> | ||
#include <openvino/core/node_output.hpp> | ||
#include <openvino/opsets/opset9.hpp> | ||
#include <openvino/pass/manager.hpp> | ||
#include <plugin/transformations/decompose_reduce_scalar_output.hpp> | ||
#include <string> | ||
|
||
#include "common_test_utils/ov_test_utils.hpp" | ||
#include "intel_gpu/primitives/reduce.hpp" | ||
|
||
using namespace testing; | ||
using namespace ov::intel_gpu; | ||
using namespace ov; | ||
|
||
template <class T> | ||
std::shared_ptr<ov::Model> build_model(const ov::PartialShape& input_shape, | ||
const ov::element::Type& input_type, | ||
const std::vector<size_t>& reduction_axes, | ||
const bool keep_dim) { | ||
const auto in = std::make_shared<ov::op::v0::Parameter>(input_type, input_shape); | ||
auto reduce = std::make_shared<T>( | ||
in->get_default_output(), | ||
ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()}, reduction_axes), | ||
keep_dim); | ||
return std::make_shared<ov::Model>(ov::NodeVector{reduce}, ov::ParameterVector{in}); | ||
} | ||
|
||
// Static-shape reduce-to-scalar case. Expected decomposition: first reduce the longest
// dimension (axis 2, size 1024) with keep_dims=true, then apply the original reduce
// over all axes with the original keep_dims=false.
#define decompose_reduce_static_shape(reduce_type)                                                        \
    const ov::PartialShape in_shape = {1, 256, 1024, 10};                                                 \
    const ov::element::Type in_type = ov::element::Type_t::f16;                                           \
    const std::vector<size_t> reduction_axes = {0, 1, 2, 3};                                              \
    disable_rt_info_check();                                                                              \
    {                                                                                                     \
        model = build_model<reduce_type>(in_shape, in_type, reduction_axes, false);                       \
        manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();                           \
    }                                                                                                     \
    {                                                                                                     \
        const auto param = std::make_shared<ov::op::v0::Parameter>(in_type, in_shape);                    \
        auto reduce =                                                                                     \
            std::make_shared<reduce_type>(param->get_default_output(),                                    \
                                          ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1},    \
                                                                       {2}),                              \
                                          true);                                                          \
        reduce = std::make_shared<reduce_type>(                                                           \
            reduce->get_default_output(),                                                                 \
            ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()},              \
                                         reduction_axes),                                                 \
            false);                                                                                       \
        model_ref = std::make_shared<ov::Model>(ov::NodeVector{reduce}, ov::ParameterVector{param});      \
    }
|
||
// Static shape reduced to a scalar output: the pass must decompose the reduce.
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_static_shape) {
    decompose_reduce_static_shape(ov::op::v1::ReduceMax)
}

TEST_F(TransformationTestsF, DecomposeReduceMinTest_static_shape) {
    decompose_reduce_static_shape(ov::op::v1::ReduceMin)
}

TEST_F(TransformationTestsF, DecomposeReduceSumTest_static_shape) {
    decompose_reduce_static_shape(ov::op::v1::ReduceSum)
}

// NOTE(review): "Prob" in the test name looks like a typo for "Prod"; kept as-is since
// renaming would change the registered test id.
TEST_F(TransformationTestsF, DecomposeReduceProbTest_static_shape) {
    decompose_reduce_static_shape(ov::op::v1::ReduceProd)
}
|
||
// Input tensor is too small (< 64 elements): one EU thread is enough, so the pass must
// leave the model untouched.
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_static_shape_small_input_skip) {
    const ov::element::Type in_type = ov::element::Type_t::f16;
    const ov::PartialShape in_shape = {1, 2, 8, 2};
    const std::vector<size_t> reduction_axes = {0, 1, 2, 3};
    {
        model = build_model<ov::op::v1::ReduceMax>(in_shape, in_type, reduction_axes, true);
        manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();
    }
    model_ref = build_model<ov::op::v1::ReduceMax>(in_shape, in_type, reduction_axes, true);
}
|
||
// Reduce produces a non-scalar output (only axis 1 is reduced): plenty of output elements
// remain, so the pass must leave the model untouched.
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_static_shape_skip) {
    const ov::element::Type in_type = ov::element::Type_t::f16;
    const ov::PartialShape in_shape = {256, 1024, 10};
    const std::vector<size_t> reduction_axes = {1};
    {
        model = build_model<ov::op::v1::ReduceMax>(in_shape, in_type, reduction_axes, true);
        manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();
    }
    model_ref = build_model<ov::op::v1::ReduceMax>(in_shape, in_type, reduction_axes, true);
}
|
||
// Dynamic-shape reduce-to-scalar case. Expected decomposition: one single-axis reduce per
// dimension (axes 3, 2, 1 in turn, keep_dims=true — axis 0 has static size 4 and is left
// alone), followed by the original reduce over all axes with keep_dims=false.
#define decompose_reduce_dynamic_shape(reduce_type)                                                       \
    const ov::PartialShape in_shape = {4, -1, -1, 10};                                                    \
    const ov::element::Type in_type = ov::element::Type_t::f16;                                           \
    const std::vector<size_t> reduction_axes = {0, 1, 2, 3};                                              \
    disable_rt_info_check();                                                                              \
    {                                                                                                     \
        model = build_model<reduce_type>(in_shape, in_type, reduction_axes, false);                       \
        manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();                           \
    }                                                                                                     \
    {                                                                                                     \
        const auto param = std::make_shared<ov::op::v0::Parameter>(in_type, in_shape);                    \
        auto reduce =                                                                                     \
            std::make_shared<reduce_type>(param->get_default_output(),                                    \
                                          ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1},    \
                                                                       {3}),                              \
                                          true);                                                          \
        reduce =                                                                                          \
            std::make_shared<reduce_type>(reduce->get_default_output(),                                   \
                                          ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1},    \
                                                                       {2}),                              \
                                          true);                                                          \
        reduce =                                                                                          \
            std::make_shared<reduce_type>(reduce->get_default_output(),                                   \
                                          ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1},    \
                                                                       {1}),                              \
                                          true);                                                          \
        reduce = std::make_shared<reduce_type>(                                                           \
            reduce->get_default_output(),                                                                 \
            ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reduction_axes.size()},              \
                                         reduction_axes),                                                 \
            false);                                                                                       \
        model_ref = std::make_shared<ov::Model>(ov::NodeVector{reduce}, ov::ParameterVector{param});      \
    }
|
||
// Dynamic shape reduced to a scalar output: the pass must decompose the reduce.
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_dynamic_shape) {
    decompose_reduce_dynamic_shape(ov::op::v1::ReduceMax)
}

TEST_F(TransformationTestsF, DecomposeReduceMinTest_dynamic_shape) {
    decompose_reduce_dynamic_shape(ov::op::v1::ReduceMin)
}

TEST_F(TransformationTestsF, DecomposeReduceSumTest_dynamic_shape) {
    decompose_reduce_dynamic_shape(ov::op::v1::ReduceSum)
}

// NOTE(review): "Prob" in the test name looks like a typo for "Prod"; kept as-is since
// renaming would change the registered test id.
TEST_F(TransformationTestsF, DecomposeReduceProbTest_dynamic_shape) {
    decompose_reduce_dynamic_shape(ov::op::v1::ReduceProd)
}
|
||
// Dynamic shape but non-scalar output (only axis 2 is reduced): the pass must leave the
// model untouched.
TEST_F(TransformationTestsF, DecomposeReduceMaxTest_dynamic_shape_skip) {
    const ov::element::Type in_type = ov::element::Type_t::f16;
    const ov::PartialShape in_shape = {4, -1, -1, 10};
    const std::vector<size_t> reduction_axes = {2};
    {
        model = build_model<ov::op::v1::ReduceMax>(in_shape, in_type, reduction_axes, false);
        manager.register_pass<ov::intel_gpu::DecomposeReduceForScalarOutput>();
    }
    model_ref = build_model<ov::op::v1::ReduceMax>(in_shape, in_type, reduction_axes, false);
}