[CPU][ARM] Enable NHWC in Reduce #23108

Merged · 18 commits · Jun 11, 2024
Changes from all commits
32 changes: 22 additions & 10 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.cpp
@@ -36,24 +36,36 @@ bool AclReduceExecutor::init(const ReduceAttrs& reduceAttrs,

     this->reduceAttrs = reduceAttrs;

-    auto srcDims = srcDescs[0]->getShape().getStaticDims();
-    auto dstDims = dstDescs[0]->getShape().getStaticDims();
+    const auto& srcDims = srcDescs[0]->getShape().getStaticDims();
+    const auto& dstDims = dstDescs[0]->getShape().getStaticDims();
+    bool hasSrcNspcLayout = srcDescs[0]->hasLayoutType(LayoutType::nspc);
+    bool hasDstNspcLayout = dstDescs[0]->hasLayoutType(LayoutType::nspc);
+    auto srcShape = shapeCast(srcDims);
+    auto dstShape = shapeCast(dstDims);
+    if (hasSrcNspcLayout && hasDstNspcLayout) {
+        changeLayoutToNH_C({&srcShape, &dstShape});
+    }

-    TensorInfo srcTensorInfo = TensorInfo(shapeCast(srcDims), 1,
+    TensorInfo srcTensorInfo = TensorInfo(srcShape, 1,
         precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0]));
-    TensorInfo dstTensorInfo = TensorInfo(shapeCast(dstDims), 1,
+    TensorInfo dstTensorInfo = TensorInfo(dstShape, 1,
         precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0]));

     srcTensor.allocator()->init(srcTensorInfo);
     dstTensor.allocator()->init(dstTensorInfo);

     std::function<std::unique_ptr<IFunction>(void)> exec_func;
+    std::vector<int> castedAxes;
+    for (size_t i = 0; i < reduceAttrs.axes.size(); ++i) {
+        int axis = axisCast(reduceAttrs.axes[i], srcDims.size(), hasSrcNspcLayout ? NHWC_TO_NCHW : NO_LAYOUT_CONVERSION);
+        if (hasSrcNspcLayout && axis == -1) return false;
+        castedAxes.push_back(axis);
+    }
     switch (reduceAttrs.operation) {
         case Algorithm::ReduceMean: {
             for (size_t i = 0; i < reduceAttrs.axes.size(); ++i) {
-                auto axe = axisCast(reduceAttrs.axes[i], srcDims.size());
                 auto pos = axisCast(i, reduceAttrs.axes.size());
-                axesMean.set(pos, axe);
+                axesMean.set(pos, castedAxes[i]);
             }
             Status reduceMeanStatus = NEReduceMean::validate(&srcTensorInfo, axesMean, reduceAttrs.keepDims, &dstTensorInfo);
             if (!reduceMeanStatus) {
@@ -71,15 +83,15 @@ bool AclReduceExecutor::init(const ReduceAttrs& reduceAttrs,
         case Algorithm::ReduceMin:
         case Algorithm::ReduceSum:
         case Algorithm::ReduceProd: {
-            Status reductionOperationStatus = NEReductionOperation::validate(&srcTensorInfo, &dstTensorInfo, axisCast(reduceAttrs.axes[0], srcDims.size()),
+            Status reductionOperationStatus = NEReductionOperation::validate(&srcTensorInfo, &dstTensorInfo, castedAxes[0],
                 getAclReductionOperationByAlgorithm(reduceAttrs.operation), reduceAttrs.keepDims);
             if (!reductionOperationStatus) {
                 DEBUG_LOG("NEReductionOperation validation with indices failed: ", reductionOperationStatus.error_description());
                 return false;
             }
-            exec_func = [this, srcDims]() -> std::unique_ptr<IFunction> {
+            exec_func = [this, castedAxes]() -> std::unique_ptr<IFunction> {
                 auto acl_op = std::make_unique<arm_compute::NEReductionOperation>();
-                acl_op->configure(&srcTensor, &dstTensor, axisCast(this->reduceAttrs.axes[0], srcDims.size()),
+                acl_op->configure(&srcTensor, &dstTensor, castedAxes[0],
                     getAclReductionOperationByAlgorithm(this->reduceAttrs.operation), this->reduceAttrs.keepDims);
                 return acl_op;
             };
@@ -103,4 +115,4 @@ void AclReduceExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vect
 }

 } // namespace intel_cpu
-} // namespace ov
\ No newline at end of file
+} // namespace ov
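For readers less familiar with ACL, the validate-then-configure pattern used in init() above is standard Compute Library usage: validate() is called on TensorInfo descriptors first so the executor can fall back cleanly, and configure() runs only once validation succeeds. A minimal standalone sketch of that flow for a single-axis sum reduction (illustrative only, not plugin code; assumes the Compute Library headers and runtime are available):

#include <arm_compute/runtime/NEON/functions/NEReductionOperation.h>
#include <arm_compute/runtime/Tensor.h>

int main() {
    using namespace arm_compute;
    // ACL TensorShape lists dimensions fastest-varying first, so an OpenVINO
    // NCHW tensor {N=2, C=3, H=4, W=5} is described to ACL as {W, H, C, N}.
    TensorInfo srcInfo(TensorShape(5U, 4U, 3U, 2U), 1, DataType::F32);
    TensorInfo dstInfo(TensorShape(5U, 4U, 1U, 2U), 1, DataType::F32);  // C reduced, dims kept

    // Validate first: on failure the caller can fall back instead of failing hard.
    const unsigned int aclAxis = 2;  // C in ACL's reversed WHCN indexing
    Status status = NEReductionOperation::validate(&srcInfo, &dstInfo, aclAxis,
                                                   ReductionOperation::SUM, /*keep_dims=*/true);
    if (!status) return 1;

    Tensor src, dst;
    src.allocator()->init(srcInfo);
    dst.allocator()->init(dstInfo);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    // Configure only after validation succeeded, as init() does above.
    NEReductionOperation reduce;
    reduce.configure(&src, &dst, aclAxis, ReductionOperation::SUM, /*keep_dims=*/true);
    reduce.run();
    return 0;
}

The same idea drives the fallback above: when axisCast returns -1 for an nspc tensor, or validation fails, init() returns false and the node uses the ncsp descriptor instead.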
29 changes: 27 additions & 2 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp
@@ -67,8 +67,33 @@ inline arm_compute::TensorShape shapeCast(const VectorDims& dims) {
     return tensorShape;
 }

-inline std::size_t axisCast(const std::size_t axis, const std::size_t shapeSize) {
-    return shapeSize - axis - 1;
+enum ACLAxisCastMode {
+    NO_LAYOUT_CONVERSION,
+    NHWC_TO_NCHW,
+    NCHW_TO_NHWC
+};
+
+/**
+ * @brief Return the reverted axis used in ACL. If the axis cast mode is NHWC_TO_NCHW or
+ * NCHW_TO_NHWC, the reverted axis is additionally remapped to the target layout.
+ * @param axis axis that needs to be converted
+ * @param shapeSize size of the shape whose axis needs to be converted
+ * @param axisCastMode specifies whether layout conversion is required or not
+ * @return converted axis, or -1 if the axis cannot be converted
+ */
+inline int axisCast(const std::size_t axis, const std::size_t shapeSize, ACLAxisCastMode axisCastMode = NO_LAYOUT_CONVERSION) {
+    // CWHN (reverted NHWC) (0, 1, 2, 3) into WHCN (reverted NCHW) (1, 2, 0, 3)
+    static std::vector<size_t> nhwcToNchw = {1, 2, 0, 3};
+    // WHCN (reverted NCHW) (0, 1, 2, 3) into CWHN (reverted NHWC) (2, 0, 1, 3)
+    static std::vector<size_t> nchwToNhwc = {2, 0, 1, 3};
+    size_t revertedAxis = shapeSize - axis - 1;
+    switch (axisCastMode) {
+        case NHWC_TO_NCHW:
+            return revertedAxis > 3 ? -1 : nhwcToNchw[revertedAxis];
+        case NCHW_TO_NHWC:
+            return revertedAxis > 3 ? -1 : nchwToNhwc[revertedAxis];
+        default:
+            return revertedAxis;
+    }
+}

inline Dim vectorProduct(const VectorDims& vec, size_t size) {
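A quick way to see what the new axisCast does for a 4D nspc (NHWC) tensor: the OpenVINO axis is first reverted (ACL shapes list dimensions fastest-varying first) and then remapped through the permutation table; anything outside the 4D tables yields -1. A small self-contained sketch using a local copy of the helper (illustrative only, not plugin code):

#include <cstddef>
#include <iostream>
#include <vector>

// Local copy of the axisCast helper above, for demonstration.
enum ACLAxisCastMode { NO_LAYOUT_CONVERSION, NHWC_TO_NCHW, NCHW_TO_NHWC };

int axisCast(std::size_t axis, std::size_t shapeSize, ACLAxisCastMode mode = NO_LAYOUT_CONVERSION) {
    static const std::vector<std::size_t> nhwcToNchw = {1, 2, 0, 3};
    static const std::vector<std::size_t> nchwToNhwc = {2, 0, 1, 3};
    std::size_t reverted = shapeSize - axis - 1;
    switch (mode) {
        case NHWC_TO_NCHW: return reverted > 3 ? -1 : static_cast<int>(nhwcToNchw[reverted]);
        case NCHW_TO_NHWC: return reverted > 3 ? -1 : static_cast<int>(nchwToNhwc[reverted]);
        default:           return static_cast<int>(reverted);
    }
}

int main() {
    // 4D NHWC tensor, OpenVINO axis numbering: N=0, H=1, W=2, C=3.
    for (std::size_t axis = 0; axis < 4; ++axis) {
        std::cout << "OV axis " << axis << " -> ACL axis "
                  << axisCast(axis, 4, NHWC_TO_NCHW) << "\n";  // prints 3, 0, 2, 1
    }
    // A 5D nspc axis falls outside the 4D permutation tables and yields -1,
    // which AclReduceExecutor::init above treats as "unsupported, fall back".
    std::cout << axisCast(0, 5, NHWC_TO_NCHW) << "\n";  // -1
    return 0;
}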
3 changes: 1 addition & 2 deletions src/plugins/intel_cpu/src/nodes/reduce.cpp
@@ -1983,8 +1983,7 @@ void Reduce::initSupportedPrimitiveDescriptors() {
         if (axis < 0)
             axis += static_cast<int>(getInputShapeAtPort(REDUCE_DATA).getRank());
     }
-    // TODO: Per-channel layout is disabled due to accuracy issue in ACL Reduce Executor
-    // pushDesc(LayoutType::nspc, LayoutType::nspc, input_prec, output_prec, undef, true);
+    pushDesc(LayoutType::nspc, LayoutType::nspc, input_prec, output_prec, impl_desc_type::undef, true);
     pushDesc(LayoutType::ncsp, LayoutType::ncsp, input_prec, output_prec, impl_desc_type::undef, true);
     canUseAclExecutor = !supportedPrimitiveDescriptors.empty();
     if (canUseAclExecutor)
38 changes: 38 additions & 0 deletions src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.cpp
@@ -0,0 +1,38 @@
// Copyright (C) 2020-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0


#include "convert_reduce_no_keep_dims.hpp"

#include "openvino/core/rt_info.hpp"
#include "openvino/opsets/opset8.hpp"

template <class T>
ov::matcher_pass_callback ov::intel_cpu::ConvertReduceNoKeepDimsBase::convert_reduce() {
    return [&](ov::pass::pattern::Matcher& m) {
        auto reduce = std::dynamic_pointer_cast<T>(m.get_match_root());
        if (!reduce || reduce->get_keep_dims()) {
            return false;
        }

        reduce->set_keep_dims(true);
        const auto reduce_new = reduce->clone_with_new_inputs({reduce->input_value(0), reduce->input_value(1)});
        std::shared_ptr<ov::Node> squeeze = std::make_shared<ov::op::v0::Squeeze>(reduce_new, reduce->input_value(1));
        squeeze->set_friendly_name(reduce_new->get_friendly_name());
        ov::copy_runtime_info(reduce, {reduce_new, squeeze});
        ov::replace_node(reduce, squeeze);

        return true;
    };
}

template <typename ReductionType>
ov::intel_cpu::ConvertReduction<ReductionType>::ConvertReduction() {
    auto m = std::make_shared<ov::pass::pattern::Matcher>(
        ov::pass::pattern::wrap_type<ReductionType>({ov::pass::pattern::any_input(),
                                                     ov::pass::pattern::wrap_type<ov::opset8::Constant>()}),
        "ConvertReduction");
    register_matcher(m, convert_reduce<ReductionType>());
}

template class ov::intel_cpu::ConvertReduction<ov::op::util::LogicalReductionKeepDims>;
template class ov::intel_cpu::ConvertReduction<ov::op::util::ArithmeticReductionKeepDims>;
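Applying the rewrite to a model goes through the usual pass-manager flow; the GraphRewrite wrapper that bundles both matchers is declared in the header below. A brief usage sketch (assuming the plugin headers are on the include path):

#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/pass/manager.hpp"
#include "transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.hpp"

// Runs the rewrite over a model that may contain Reduce ops with keepDims = false.
void run_convert_reduce_no_keep_dims(const std::shared_ptr<ov::Model>& model) {
    ov::pass::Manager manager;
    // GraphRewrite wrapper registering both the logical and arithmetic
    // reduction matchers (declared in the header below).
    manager.register_pass<ov::intel_cpu::ConvertReduceNoKeepDims>();
    manager.run_passes(model);
}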
71 changes: 71 additions & 0 deletions src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.hpp
@@ -0,0 +1,71 @@
// Copyright (C) 2020-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "openvino/pass/graph_rewrite.hpp"
#include "openvino/op/util/arithmetic_reductions_keep_dims.hpp"
#include "openvino/op/util/logical_reduction_keep_dims.hpp"

/*
 * Description:
 *     ConvertReduceNoKeepDimsBase detects Reduce operations with keepDims = false.
 *     Such a Reduce operation is replaced with an equivalent Reduce with keepDims = true,
 *     followed by a Squeeze which removes the undesired dimensions.

   [Review comment on lines +12 to +15 — EgorDuplensky (Contributor), Jun 7, 2024:
    @itikhono Could you please share your opinion regarding the appropriate place for this
    transformation? We do have an inverted one (where we fuse squeeze into reduce instead)
    in the common transformations scope. Isn't it better to place this one into the location
    of the common transformations as well? I am not proposing to do this in scope of the
    current PR, but we could do it right after.]
 *
 * Before:
 *
 *   +--------------+        +-----------------+
 *   |     Data     |        |   Axes tensor   |
 *   +-----------+--+        +-+---------------+
 *               |             |
 *       +---------------------------+
 *       | Reduce (keepDims = false) |
 *       +---------------------------+
 *
 * After:
 *
 *   +--------------+        +-----------------+
 *   |     Data     |        |   Axes tensor   |
 *   +-----------+--+        +-+------------+--+
 *               |             |            |
 *       +---------------------------+      |
 *       | Reduce (keepDims = true)  |      |
 *       +-----------------------+---+      |
 *                               |          |
 *                          +----v------v-----+
 *                          |     Squeeze     |
 *                          +-----------------+
 *
 */

namespace ov {
namespace intel_cpu {

class ConvertReduceNoKeepDimsBase: public ov::pass::MatcherPass {
public:
    OPENVINO_RTTI("ConvertReduceNoKeepDims", "0");
    template <class T>
    ov::matcher_pass_callback convert_reduce();
};

template <typename ReductionType>
class ConvertReduction: public ConvertReduceNoKeepDimsBase {
public:
    OPENVINO_RTTI("ConvertReduction", "0");
    ConvertReduction();
};


class ConvertReduceNoKeepDims: public ov::pass::GraphRewrite {
public:
    OPENVINO_RTTI("ConvertReduceNoKeepDims", "0");
    ConvertReduceNoKeepDims() {
        add_matcher<ConvertReduction<ov::op::util::LogicalReductionKeepDims>>();
        add_matcher<ConvertReduction<ov::op::util::ArithmeticReductionKeepDims>>();
    }
};

} // namespace intel_cpu
} // namespace ov
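Concretely, for the {2, 19, 2, 9} input and axes {0, 1} used in the tests further down, the rewritten subgraph preserves the original output shape. A minimal sketch of the before/after semantics built directly with opset1 (illustrative; not part of the PR):

#include <iostream>
#include <memory>

#include "openvino/opsets/opset1.hpp"

int main() {
    auto param = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::Shape{2, 19, 2, 9});
    auto axes = ov::opset1::Constant::create(ov::element::i64, ov::Shape{2}, {0, 1});

    // Before the pass: Reduce with keepDims = false drops axes 0 and 1 entirely.
    auto before = std::make_shared<ov::opset1::ReduceSum>(param, axes, false);
    std::cout << before->get_output_shape(0) << "\n";   // shape {2, 9}

    // After the pass: keepDims = true retains them as 1s, Squeeze then drops them.
    auto reduced = std::make_shared<ov::opset1::ReduceSum>(param, axes, true);
    auto after = std::make_shared<ov::opset1::Squeeze>(reduced, axes);
    std::cout << reduced->get_output_shape(0) << "\n";  // shape {1, 1, 2, 9}
    std::cout << after->get_output_shape(0) << "\n";    // shape {2, 9}, same as before
    return 0;
}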
2 changes: 2 additions & 0 deletions src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -113,6 +113,7 @@
 #include "transformations/cpu_opset/arm/pass/convert_group_conv1d.hpp"
 #include "transformations/cpu_opset/arm/pass/convert_reduce_multi_axis.hpp"
 #include "transformations/cpu_opset/arm/pass/mish_decomposition.hpp"
+#include "transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.hpp"
 #include "transformations/cpu_opset/common/pass/decompose_integer_divide.hpp"
 #include "transformations/cpu_opset/common/pass/convert_fq_rnn_to_quantized_rnn.hpp"
 #include "transformations/cpu_opset/common/pass/insert_convert_after_extension.hpp"
@@ -416,6 +417,7 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
     CPU_REGISTER_PASS_COMMON(manager, SwapConvertTranspose);
     CPU_REGISTER_PASS_X64(manager, ConvertToInteraction);
     CPU_REGISTER_PASS_X64(manager, ConvertInteractionInt8);
+    CPU_REGISTER_PASS_ARM(manager, ConvertReduceNoKeepDims);
     CPU_REGISTER_PASS_ARM(manager, ConvertReduceMultiAxis);
     CPU_REGISTER_PASS_ARM(manager, MishDecomposition);
     CPU_REGISTER_PASS_ARM(manager, ConvertConv1D);
@@ -46,10 +46,7 @@ std::vector<std::vector<ov::test::InputShape>> inputShapes_SingleBatch = {

 std::vector<CPUSpecificParams> cpuParams_4D = {
     CPUSpecificParams({nchw}, {nchw}, {}, {}),
-    // NHWC layout is disabled on ARM due to accuracy issue: https://github.com/ARM-software/ComputeLibrary/issues/1044
-#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
     CPUSpecificParams({nhwc}, {nhwc}, {}, {}),
-#endif
 };

/* ================================ 1.1 No fusion - Arithmetic ================================ */
@@ -160,4 +157,4 @@ INSTANTIATE_TEST_SUITE_P(

} // namespace Reduce
} // namespace test
-} // namespace ov
\ No newline at end of file
+} // namespace ov
@@ -0,0 +1,91 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <gtest/gtest.h>

#include <openvino/opsets/opset1.hpp>
#include <transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.hpp>
#include "common_test_utils/ov_test_utils.hpp"

using namespace ov::intel_cpu;

template <class T>
class ConvertReduceNoKeepDimsTest : public testing::Test {};

template <class T>
static std::shared_ptr<ov::Model> createInitGraph(std::shared_ptr<ov::opset1::Parameter> param) {
    auto axes = ov::opset1::Constant::create(ov::element::i64, ov::Shape{2}, {0, 1});
    auto reduce = std::make_shared<T>(param, axes, false);
    return std::make_shared<ov::Model>(ov::NodeVector{ reduce }, ov::ParameterVector{ param });
}

template <class T>
static std::shared_ptr<ov::Model> createRefGraph(std::shared_ptr<ov::opset1::Parameter> param) {
    auto axes = ov::opset1::Constant::create(ov::element::i64, ov::Shape{2}, {0, 1});
    auto reduce = std::make_shared<T>(param, axes, true);
    auto squeeze = std::make_shared<ov::opset1::Squeeze>(reduce, axes);
    return std::make_shared<ov::Model>(ov::NodeVector{ squeeze }, ov::ParameterVector{ param });
}

template <class T>
static bool registerAndRunReducePass(std::shared_ptr<ov::Model> model) {
    ov::pass::Manager manager;
    if (std::is_base_of<ov::op::util::LogicalReductionKeepDims, T>::value) {
        manager.register_pass<ConvertReduction<ov::op::util::LogicalReductionKeepDims>>();
    } else if (std::is_base_of<ov::op::util::ArithmeticReductionKeepDims, T>::value) {
        manager.register_pass<ConvertReduction<ov::op::util::ArithmeticReductionKeepDims>>();
    } else {
        return false;
    }
    manager.run_passes(model);
    return true;
}

static ov::Shape static_param_shape = ov::Shape{2, 19, 2, 9};
static ov::PartialShape dynamic_param_shape = ov::PartialShape{2, -1, 2, 9};

TYPED_TEST_SUITE_P(ConvertReduceNoKeepDimsTest);

TYPED_TEST_P(ConvertReduceNoKeepDimsTest, CheckConvertReduceTransformationIsAppliedForStaticShapes) {
    ov::element::Type_t dataType = std::is_base_of<ov::op::util::LogicalReductionKeepDims, TypeParam>::value ?
                                   ov::element::boolean : ov::element::f32;
    auto param = std::make_shared<ov::opset1::Parameter>(dataType, static_param_shape);
    auto model = createInitGraph<TypeParam>(param);
    auto model_ref = createRefGraph<TypeParam>(param);

    if (!registerAndRunReducePass<TypeParam>(model)) {
        FAIL() << "Reduce pass is not registered.";
    }

    auto res = compare_functions(model, model_ref);
    ASSERT_TRUE(res.first) << res.second;
}

TYPED_TEST_P(ConvertReduceNoKeepDimsTest, CheckConvertReduceTransformationIsAppliedForDynamicShapes) {
    ov::element::Type_t dataType = std::is_base_of<ov::op::util::LogicalReductionKeepDims, TypeParam>::value ?
                                   ov::element::boolean : ov::element::f32;
    auto param = std::make_shared<ov::opset1::Parameter>(dataType, dynamic_param_shape);
    auto model = createInitGraph<TypeParam>(param);
    auto model_ref = createRefGraph<TypeParam>(param);

    if (!registerAndRunReducePass<TypeParam>(model)) {
        FAIL() << "Reduce pass is not registered.";
    }

    auto res = compare_functions(model, model_ref);
    ASSERT_TRUE(res.first) << res.second;
}

REGISTER_TYPED_TEST_SUITE_P(ConvertReduceNoKeepDimsTest,
                            CheckConvertReduceTransformationIsAppliedForStaticShapes,
                            CheckConvertReduceTransformationIsAppliedForDynamicShapes);

using reduceTypes = ::testing::Types<ov::opset1::ReduceMin,
                                     ov::opset1::ReduceMax,
                                     ov::opset1::ReduceSum,
                                     ov::opset1::ReduceProd,
                                     ov::opset1::ReduceMean,
                                     ov::opset1::ReduceLogicalAnd,
                                     ov::opset1::ReduceLogicalOr>;
INSTANTIATE_TYPED_TEST_SUITE_P(ConvertReduce, ConvertReduceNoKeepDimsTest, reduceTypes);