Skip to content

Commit

Permalink
[CPU][ARM] Enable NHWC in Reduce (#23108)
Browse files Browse the repository at this point in the history
NHWC was disabled because of an accuracy issue:
ARM-software/ComputeLibrary#1044

CVS-114403

---------

Co-authored-by: eshoguli <[email protected]>
  • Loading branch information
alvoron and eshoguli authored Jun 11, 2024
1 parent f5c3fc3 commit 4e6ee7f
Show file tree
Hide file tree
Showing 8 changed files with 253 additions and 18 deletions.
32 changes: 22 additions & 10 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,24 +36,36 @@ bool AclReduceExecutor::init(const ReduceAttrs& reduceAttrs,

this->reduceAttrs = reduceAttrs;

auto srcDims = srcDescs[0]->getShape().getStaticDims();
auto dstDims = dstDescs[0]->getShape().getStaticDims();
const auto& srcDims = srcDescs[0]->getShape().getStaticDims();
const auto& dstDims = dstDescs[0]->getShape().getStaticDims();
bool hasSrcNspcLayout = srcDescs[0]->hasLayoutType(LayoutType::nspc);
bool hasDstNspcLayout = dstDescs[0]->hasLayoutType(LayoutType::nspc);
auto srcShape = shapeCast(srcDims);
auto dstShape = shapeCast(dstDims);
if (hasSrcNspcLayout && hasDstNspcLayout) {
changeLayoutToNH_C({&srcShape, &dstShape});
}

TensorInfo srcTensorInfo = TensorInfo(shapeCast(srcDims), 1,
TensorInfo srcTensorInfo = TensorInfo(srcShape, 1,
precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0]));
TensorInfo dstTensorInfo = TensorInfo(shapeCast(dstDims), 1,
TensorInfo dstTensorInfo = TensorInfo(dstShape, 1,
precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0]));

srcTensor.allocator()->init(srcTensorInfo);
dstTensor.allocator()->init(dstTensorInfo);

std::function<std::unique_ptr<IFunction>(void)> exec_func;
std::vector<int> castedAxes;
for (size_t i = 0; i < reduceAttrs.axes.size(); ++i) {
int axis = axisCast(reduceAttrs.axes[i], srcDims.size(), hasSrcNspcLayout ? NHWC_TO_NCHW : NO_LAYOUT_CONVERSION);
if (hasSrcNspcLayout && axis == -1) return false;
castedAxes.push_back(axis);
}
switch (reduceAttrs.operation) {
case Algorithm::ReduceMean: {
for (size_t i = 0; i < reduceAttrs.axes.size(); ++i) {
auto axe = axisCast(reduceAttrs.axes[i], srcDims.size());
auto pos = axisCast(i, reduceAttrs.axes.size());
axesMean.set(pos, axe);
axesMean.set(pos, castedAxes[i]);
}
Status reduceMeanStatus = NEReduceMean::validate(&srcTensorInfo, axesMean, reduceAttrs.keepDims, &dstTensorInfo);
if (!reduceMeanStatus) {
Expand All @@ -71,15 +83,15 @@ bool AclReduceExecutor::init(const ReduceAttrs& reduceAttrs,
case Algorithm::ReduceMin:
case Algorithm::ReduceSum:
case Algorithm::ReduceProd: {
Status reductionOperationStatus = NEReductionOperation::validate(&srcTensorInfo, &dstTensorInfo, axisCast(reduceAttrs.axes[0], srcDims.size()),
Status reductionOperationStatus = NEReductionOperation::validate(&srcTensorInfo, &dstTensorInfo, castedAxes[0],
getAclReductionOperationByAlgorithm(reduceAttrs.operation), reduceAttrs.keepDims);
if (!reductionOperationStatus) {
DEBUG_LOG("NEReductionOperation validation with indices failed: ", reductionOperationStatus.error_description());
return false;
}
exec_func = [this, srcDims]() -> std::unique_ptr<IFunction> {
exec_func = [this, castedAxes]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<arm_compute::NEReductionOperation>();
acl_op->configure(&srcTensor, &dstTensor, axisCast(this->reduceAttrs.axes[0], srcDims.size()),
acl_op->configure(&srcTensor, &dstTensor, castedAxes[0],
getAclReductionOperationByAlgorithm(this->reduceAttrs.operation), this->reduceAttrs.keepDims);
return acl_op;
};
Expand All @@ -103,4 +115,4 @@ void AclReduceExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vect
}

} // namespace intel_cpu
} // namespace ov
} // namespace ov
29 changes: 27 additions & 2 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,33 @@ inline arm_compute::TensorShape shapeCast(const VectorDims& dims) {
return tensorShape;
}

inline std::size_t axisCast(const std::size_t axis, const std::size_t shapeSize) {
return shapeSize - axis - 1;
// Layout-conversion mode for axisCast().
enum ACLAxisCastMode {
    NO_LAYOUT_CONVERSION,   // only revert the axis order (OV -> ACL numbering)
    NHWC_TO_NCHW,           // additionally remap an NHWC axis onto NCHW
    NCHW_TO_NHWC            // additionally remap an NCHW axis onto NHWC
};

/**
* @brief Convert an axis from OpenVINO numbering to the reverted numbering used by ACL,
* optionally remapping it between NCHW and NHWC layouts.
* @param axis axis that needs to be converted
* @param shapeSize size of the shape, which axis needs to be converted
* @param axisCastMode specifies whether layout conversion is required or not
* @return converted axis, or -1 when layout conversion is requested and the reverted
* axis falls outside the 4D (NCHW/NHWC) range
*/
inline int axisCast(const std::size_t axis, const std::size_t shapeSize, ACLAxisCastMode axisCastMode = NO_LAYOUT_CONVERSION) {
    // CWHN (reverted NHWC) (0, 1, 2, 3) into WHCN (reverted NCHW) (1, 2, 0, 3)
    static constexpr int nhwcToNchw[] = {1, 2, 0, 3};
    // WHCN (reverted NCHW) (0, 1, 2, 3) into CWHN (reverted NHWC) (2, 0, 1, 3)
    static constexpr int nchwToNhwc[] = {2, 0, 1, 3};
    const std::size_t revertedAxis = shapeSize - axis - 1;
    switch (axisCastMode) {
        case NHWC_TO_NCHW:
            return revertedAxis > 3 ? -1 : nhwcToNchw[revertedAxis];
        case NCHW_TO_NHWC:
            return revertedAxis > 3 ? -1 : nchwToNhwc[revertedAxis];
        default:
            return static_cast<int>(revertedAxis);
    }
}

inline Dim vectorProduct(const VectorDims& vec, size_t size) {
Expand Down
3 changes: 1 addition & 2 deletions src/plugins/intel_cpu/src/nodes/reduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1992,8 +1992,7 @@ void Reduce::initSupportedPrimitiveDescriptors() {
if (axis < 0)
axis += static_cast<int>(getInputShapeAtPort(REDUCE_DATA).getRank());
}
// TODO: Per-channel layout is disabled due to accuracy issue in ACL Reduce Executor
// pushDesc(LayoutType::nspc, LayoutType::nspc, input_prec, output_prec, undef, true);
pushDesc(LayoutType::nspc, LayoutType::nspc, input_prec, output_prec, impl_desc_type::undef, true);
pushDesc(LayoutType::ncsp, LayoutType::ncsp, input_prec, output_prec, impl_desc_type::undef, true);
canUseAclExecutor = !supportedPrimitiveDescriptors.empty();
if (canUseAclExecutor)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// Copyright (C) 2020-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0


#include "convert_reduce_no_keep_dims.hpp"

#include "openvino/core/rt_info.hpp"
#include "openvino/opsets/opset8.hpp"

/**
 * @brief Builds the matcher callback that rewrites a Reduce of type T with
 * keep_dims=false into the same Reduce with keep_dims=true followed by a Squeeze
 * over the reduction axes, preserving the original output shape.
 */
template <class T>
ov::matcher_pass_callback ov::intel_cpu::ConvertReduceNoKeepDimsBase::convert_reduce() {
    // The callback is stored by the pass machinery and outlives this function,
    // so it must not capture by reference; it needs no captures at all
    // (the original `[&]` was an accidental reference capture of nothing).
    return [](ov::pass::pattern::Matcher& m) {
        auto reduce = std::dynamic_pointer_cast<T>(m.get_match_root());
        // Only rewrite reductions that actually drop the reduced dimensions.
        if (!reduce || reduce->get_keep_dims()) {
            return false;
        }

        // Clone the reduction with keep_dims=true, then squeeze out the kept
        // size-1 dimensions using the same axes input.
        reduce->set_keep_dims(true);
        const auto reduce_new = reduce->clone_with_new_inputs({reduce->input_value(0), reduce->input_value(1)});
        std::shared_ptr<ov::Node> squeeze = std::make_shared<ov::op::v0::Squeeze>(reduce_new, reduce->input_value(1));
        squeeze->set_friendly_name(reduce_new->get_friendly_name());
        ov::copy_runtime_info(reduce, {reduce_new, squeeze});
        ov::replace_node(reduce, squeeze);

        return true;
    };
}

// Registers a matcher for any reduction derived from ReductionType whose axes
// input is a Constant, and attaches the keep-dims rewrite callback.
template <typename ReductionType>
ov::intel_cpu::ConvertReduction<ReductionType>::ConvertReduction() {
    const auto reduction_pattern = ov::pass::pattern::wrap_type<ReductionType>(
        {ov::pass::pattern::any_input(), ov::pass::pattern::wrap_type<ov::opset8::Constant>()});
    auto matcher = std::make_shared<ov::pass::pattern::Matcher>(reduction_pattern, "ConvertReduction");
    register_matcher(matcher, convert_reduce<ReductionType>());
}

template class ov::intel_cpu::ConvertReduction<ov::op::util::LogicalReductionKeepDims>;
template class ov::intel_cpu::ConvertReduction<ov::op::util::ArithmeticReductionKeepDims>;
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// Copyright (C) 2020-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "openvino/pass/graph_rewrite.hpp"
#include "openvino/op/util/arithmetic_reductions_keep_dims.hpp"
#include "openvino/op/util/logical_reduction_keep_dims.hpp"

/*
* Description:
* ConvertReduceNoKeepDimsBase detects Reduce operations with keepDims = false.
* Such Reduce operation is replaced with Reduce operation with keepDims = true and Squeeze
* which removes undesired dimensions.
*
* Before:
*
* +--------------+ +-----------------+
* | Data | | Axes tensor |
* +-----------+--+ +-+---------------+
* | |
* +---------------------------+
* | Reduce (keepDims = false) |
* +---------------------------+
*
* After:
*
* +--------------+ +-----------------+
* | Data | | Axes tensor |
* +-----------+--+ +-+------------+--+
* | | |
* +---------------------------+ |
* | Reduce (keepDims = true) | |
* +-----------------------+---+ |
* | |
* +--------v------v-+
* | Squeeze |
* +-----------------+
*
*/

namespace ov {
namespace intel_cpu {

/**
 * @brief Base matcher pass providing the callback that replaces a Reduce with
 * keepDims = false by an equivalent Reduce (keepDims = true) + Squeeze pair.
 */
class ConvertReduceNoKeepDimsBase: public ov::pass::MatcherPass {
public:
    // RTTI name previously duplicated "ConvertReduceNoKeepDims" of the GraphRewrite
    // declared below; two distinct pass types must not share one type-info identifier.
    OPENVINO_RTTI("ConvertReduceNoKeepDimsBase", "0");
    // Builds the matcher callback for reduction type T.
    template <class T>
    ov::matcher_pass_callback convert_reduce();
};

// Matcher pass for one reduction category (logical or arithmetic keep-dims base).
// NOTE(review): both template instantiations share the RTTI name "ConvertReduction" —
// presumably acceptable; verify if passes are ever looked up by type_info name.
template <typename ReductionType>
class ConvertReduction: public ConvertReduceNoKeepDimsBase {
public:
    OPENVINO_RTTI("ConvertReduction", "0");
    // Registers the Reduce-with-Constant-axes pattern (defined in the .cpp).
    ConvertReduction();
};


// GraphRewrite bundling both reduction flavours so a single registration covers
// logical (And/Or) and arithmetic (Sum/Prod/Min/Max/Mean) keep-dims reductions.
class ConvertReduceNoKeepDims: public ov::pass::GraphRewrite {
public:
    OPENVINO_RTTI("ConvertReduceNoKeepDims", "0");
    ConvertReduceNoKeepDims() {
        add_matcher<ConvertReduction<ov::op::util::LogicalReductionKeepDims>>();
        add_matcher<ConvertReduction<ov::op::util::ArithmeticReductionKeepDims>>();
    }
};

} // namespace intel_cpu
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@
#include "transformations/cpu_opset/arm/pass/convert_group_conv1d.hpp"
#include "transformations/cpu_opset/arm/pass/convert_reduce_multi_axis.hpp"
#include "transformations/cpu_opset/arm/pass/mish_decomposition.hpp"
#include "transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.hpp"
#include "transformations/cpu_opset/common/pass/decompose_integer_divide.hpp"
#include "transformations/cpu_opset/common/pass/convert_fq_rnn_to_quantized_rnn.hpp"
#include "transformations/cpu_opset/common/pass/insert_convert_after_extension.hpp"
Expand Down Expand Up @@ -432,6 +433,7 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
CPU_REGISTER_PASS_COMMON(manager, SwapConvertTranspose);
CPU_REGISTER_PASS_X64(manager, ConvertToInteraction);
CPU_REGISTER_PASS_X64(manager, ConvertInteractionInt8);
CPU_REGISTER_PASS_ARM(manager, ConvertReduceNoKeepDims);
CPU_REGISTER_PASS_ARM(manager, ConvertReduceMultiAxis);
CPU_REGISTER_PASS_ARM32(manager, MishDecomposition);
CPU_REGISTER_PASS_ARM(manager, ConvertConv1D);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,7 @@ std::vector<std::vector<ov::test::InputShape>> inputShapes_SingleBatch = {

std::vector<CPUSpecificParams> cpuParams_4D = {
CPUSpecificParams({nchw}, {nchw}, {}, {}),
//NHWC layout is disabled on ARM due to accuracy issue: https://github.com/ARM-software/ComputeLibrary/issues/1044
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
CPUSpecificParams({nhwc}, {nhwc}, {}, {}),
#endif
};

/* ================================ 1.1 No fusion - Arithmetic ================================ */
Expand Down Expand Up @@ -160,4 +157,4 @@ INSTANTIATE_TEST_SUITE_P(

} // namespace Reduce
} // namespace test
} // namespace ov
} // namespace ov
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <gtest/gtest.h>

#include <openvino/opsets/opset1.hpp>
#include <transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.hpp>
#include "common_test_utils/ov_test_utils.hpp"

using namespace ov::intel_cpu;

// Typed test fixture; the reduction operation under test is supplied via TypeParam.
template <class T>
class ConvertReduceNoKeepDimsTest : public testing::Test {};

// Builds the pre-transformation graph: a reduction over axes {0, 1} with
// keep_dims = false — exactly the pattern ConvertReduceNoKeepDims rewrites.
template <class T>
static std::shared_ptr<ov::Model> createInitGraph(std::shared_ptr<ov::opset1::Parameter> param) {
    const auto reduction_axes = ov::opset1::Constant::create(ov::element::i64, ov::Shape{2}, {0, 1});
    const auto reduction = std::make_shared<T>(param, reduction_axes, false);
    return std::make_shared<ov::Model>(ov::NodeVector{reduction}, ov::ParameterVector{param});
}

// Builds the expected post-transformation graph: the same reduction with
// keep_dims = true, followed by a Squeeze over the same axes.
template <class T>
static std::shared_ptr<ov::Model> createRefGraph(std::shared_ptr<ov::opset1::Parameter> param) {
    const auto reduction_axes = ov::opset1::Constant::create(ov::element::i64, ov::Shape{2}, {0, 1});
    const auto reduction = std::make_shared<T>(param, reduction_axes, true);
    const auto squeeze = std::make_shared<ov::opset1::Squeeze>(reduction, reduction_axes);
    return std::make_shared<ov::Model>(ov::NodeVector{squeeze}, ov::ParameterVector{param});
}

// Runs the ConvertReduction pass specialization matching T's reduction category.
// Returns false when T belongs to neither category (pass not registered).
template <class T>
static bool registerAndRunReducePass(std::shared_ptr<ov::Model> model) {
    const bool is_logical = std::is_base_of<ov::op::util::LogicalReductionKeepDims, T>::value;
    const bool is_arithmetic = std::is_base_of<ov::op::util::ArithmeticReductionKeepDims, T>::value;
    if (!is_logical && !is_arithmetic) {
        return false;
    }
    ov::pass::Manager manager;
    if (is_logical) {
        manager.register_pass<ConvertReduction<ov::op::util::LogicalReductionKeepDims>>();
    } else {
        manager.register_pass<ConvertReduction<ov::op::util::ArithmeticReductionKeepDims>>();
    }
    manager.run_passes(model);
    return true;
}

static ov::Shape static_param_shape = ov::Shape{2, 19, 2, 9};
static ov::PartialShape dynamic_param_shape = ov::PartialShape{2, -1, 2, 9};

TYPED_TEST_SUITE_P(ConvertReduceNoKeepDimsTest);

// Checks that Reduce(keep_dims=false) over a fully static input shape is rewritten
// to Reduce(keep_dims=true) + Squeeze, by comparing against a reference graph.
// NOTE(review): model and model_ref share the same Parameter node — presumably
// harmless for compare_functions; confirm this is intentional.
TYPED_TEST_P(ConvertReduceNoKeepDimsTest, CheckConvertReduceTransformationIsAppliedForStaticShapes) {
    // Logical reductions require boolean inputs; arithmetic ones use f32.
    ov::element::Type_t dataType = std::is_base_of<ov::op::util::LogicalReductionKeepDims, TypeParam>::value ?
            ov::element::boolean : ov::element::f32;
    auto param = std::make_shared<ov::opset1::Parameter>(dataType, static_param_shape);
    auto model = createInitGraph<TypeParam>(param);
    auto model_ref = createRefGraph<TypeParam>(param);

    if (!registerAndRunReducePass<TypeParam>(model)) {
        FAIL() << "Reduce pass is not registered.";
    }

    auto res = compare_functions(model, model_ref);
    ASSERT_TRUE(res.first) << res.second;
}

// Checks the transformation also applies when the data input has a dynamic
// dimension. Fixed: test name typo "Dynaimc" -> "Dynamic" (updated here and in
// the registration below, which must stay in sync).
TYPED_TEST_P(ConvertReduceNoKeepDimsTest, CheckConvertReduceTransformationIsAppliedForDynamicShapes) {
    // Logical reductions require boolean inputs; arithmetic ones use f32.
    ov::element::Type_t dataType = std::is_base_of<ov::op::util::LogicalReductionKeepDims, TypeParam>::value ?
            ov::element::boolean : ov::element::f32;
    auto param = std::make_shared<ov::opset1::Parameter>(dataType, dynamic_param_shape);
    auto model = createInitGraph<TypeParam>(param);
    auto model_ref = createRefGraph<TypeParam>(param);

    if (!registerAndRunReducePass<TypeParam>(model)) {
        FAIL() << "Reduce pass is not registered.";
    }

    auto res = compare_functions(model, model_ref);
    ASSERT_TRUE(res.first) << res.second;
}

REGISTER_TYPED_TEST_SUITE_P(ConvertReduceNoKeepDimsTest,
                            CheckConvertReduceTransformationIsAppliedForStaticShapes,
                            CheckConvertReduceTransformationIsAppliedForDynamicShapes);

using reduceTypes = ::testing::Types<ov::opset1::ReduceMin,
ov::opset1::ReduceMax,
ov::opset1::ReduceSum,
ov::opset1::ReduceProd,
ov::opset1::ReduceMean,
ov::opset1::ReduceLogicalAnd,
ov::opset1::ReduceLogicalOr>;
INSTANTIATE_TYPED_TEST_SUITE_P(ConvertReduce, ConvertReduceNoKeepDimsTest, reduceTypes);

0 comments on commit 4e6ee7f

Please sign in to comment.