From 4e6ee7f15e1991aefb2ae1a1b6a545456c8844b2 Mon Sep 17 00:00:00 2001 From: Aleksandr Voron Date: Tue, 11 Jun 2024 18:21:23 +0200 Subject: [PATCH] [CPU][ARM] Enable NHWC in Reduce (#23108) NHWC was disabled because of accuracy issue: https://github.com/ARM-software/ComputeLibrary/issues/1044 CVS-114403 --------- Co-authored-by: eshoguli --- .../src/nodes/executors/acl/acl_reduce.cpp | 32 +++++-- .../src/nodes/executors/acl/acl_utils.hpp | 29 +++++- src/plugins/intel_cpu/src/nodes/reduce.cpp | 3 +- .../arm/pass/convert_reduce_no_keep_dims.cpp | 38 ++++++++ .../arm/pass/convert_reduce_no_keep_dims.hpp | 71 +++++++++++++++ .../transformation_pipeline.cpp | 2 + .../instances/common/reduce.cpp | 5 +- .../arm/convert_reduce_no_keep_dims.cpp | 91 +++++++++++++++++++ 8 files changed, 253 insertions(+), 18 deletions(-) create mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.cpp create mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.hpp create mode 100644 src/plugins/intel_cpu/tests/unit/transformations/arm/convert_reduce_no_keep_dims.cpp diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.cpp index 339ea8aff59af1..e99747121cb623 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.cpp @@ -36,24 +36,36 @@ bool AclReduceExecutor::init(const ReduceAttrs& reduceAttrs, this->reduceAttrs = reduceAttrs; - auto srcDims = srcDescs[0]->getShape().getStaticDims(); - auto dstDims = dstDescs[0]->getShape().getStaticDims(); + const auto& srcDims = srcDescs[0]->getShape().getStaticDims(); + const auto& dstDims = dstDescs[0]->getShape().getStaticDims(); + bool hasSrcNspcLayout = srcDescs[0]->hasLayoutType(LayoutType::nspc); + bool hasDstNspcLayout = dstDescs[0]->hasLayoutType(LayoutType::nspc); + auto srcShape = 
shapeCast(srcDims); + auto dstShape = shapeCast(dstDims); + if (hasSrcNspcLayout && hasDstNspcLayout) { + changeLayoutToNH_C({&srcShape, &dstShape}); + } - TensorInfo srcTensorInfo = TensorInfo(shapeCast(srcDims), 1, + TensorInfo srcTensorInfo = TensorInfo(srcShape, 1, precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0])); - TensorInfo dstTensorInfo = TensorInfo(shapeCast(dstDims), 1, + TensorInfo dstTensorInfo = TensorInfo(dstShape, 1, precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0])); srcTensor.allocator()->init(srcTensorInfo); dstTensor.allocator()->init(dstTensorInfo); std::function(void)> exec_func; + std::vector castedAxes; + for (size_t i = 0; i < reduceAttrs.axes.size(); ++i) { + int axis = axisCast(reduceAttrs.axes[i], srcDims.size(), hasSrcNspcLayout ? NHWC_TO_NCHW : NO_LAYOUT_CONVERSION); + if (hasSrcNspcLayout && axis == -1) return false; + castedAxes.push_back(axis); + } switch (reduceAttrs.operation) { case Algorithm::ReduceMean: { for (size_t i = 0; i < reduceAttrs.axes.size(); ++i) { - auto axe = axisCast(reduceAttrs.axes[i], srcDims.size()); auto pos = axisCast(i, reduceAttrs.axes.size()); - axesMean.set(pos, axe); + axesMean.set(pos, castedAxes[i]); } Status reduceMeanStatus = NEReduceMean::validate(&srcTensorInfo, axesMean, reduceAttrs.keepDims, &dstTensorInfo); if (!reduceMeanStatus) { @@ -71,15 +83,15 @@ bool AclReduceExecutor::init(const ReduceAttrs& reduceAttrs, case Algorithm::ReduceMin: case Algorithm::ReduceSum: case Algorithm::ReduceProd: { - Status reductionOperationStatus = NEReductionOperation::validate(&srcTensorInfo, &dstTensorInfo, axisCast(reduceAttrs.axes[0], srcDims.size()), + Status reductionOperationStatus = NEReductionOperation::validate(&srcTensorInfo, &dstTensorInfo, castedAxes[0], getAclReductionOperationByAlgorithm(reduceAttrs.operation), reduceAttrs.keepDims); if (!reductionOperationStatus) { DEBUG_LOG("NEReductionOperation validation 
with indices failed: ", reductionOperationStatus.error_description()); return false; } - exec_func = [this, srcDims]() -> std::unique_ptr { + exec_func = [this, castedAxes]() -> std::unique_ptr { auto acl_op = std::make_unique(); - acl_op->configure(&srcTensor, &dstTensor, axisCast(this->reduceAttrs.axes[0], srcDims.size()), + acl_op->configure(&srcTensor, &dstTensor, castedAxes[0], getAclReductionOperationByAlgorithm(this->reduceAttrs.operation), this->reduceAttrs.keepDims); return acl_op; }; @@ -103,4 +115,4 @@ void AclReduceExecutor::exec(const std::vector& src, const std::vect } } // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp index 32a2494d15f01c..b3077d4c16e342 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp @@ -67,8 +67,33 @@ inline arm_compute::TensorShape shapeCast(const VectorDims& dims) { return tensorShape; } -inline std::size_t axisCast(const std::size_t axis, const std::size_t shapeSize) { - return shapeSize - axis - 1; +enum ACLAxisCastMode { + NO_LAYOUT_CONVERSION, + NHWC_TO_NCHW, + NCHW_TO_NHWC +}; + +/** +* @brief Return reverted axis used in ACL. 
If axis cast mode is NHWC_TO_NCHW or NCHW_TO_NHWC, the reverted axis is additionally remapped through the corresponding layout permutation table (-1 is returned if the reverted axis is greater than 3). +* @param axis axis that needs to be converted +* @param shapeSize size of the shape, which axis needs to be converted +* @param axisCastMode specifies whether layout conversion is required or not +* @return reverted axis, or -1 if a layout conversion is requested and the reverted axis is greater than 3 +*/ +inline int axisCast(const std::size_t axis, const std::size_t shapeSize, ACLAxisCastMode axisCastMode = NO_LAYOUT_CONVERSION) { + // CWHN (reverted NHWC) (0, 1, 2, 3) into WHCN (reverted NCHW) (1, 2, 0, 3) + static std::vector nhwcToNchw = {1, 2, 0, 3}; + // WHCN (reverted NCHW) (0, 1, 2, 3) into CWHN (reverted NHWC) (2, 0, 1, 3) + static std::vector nchwToNhwc = {2, 0, 1, 3}; + size_t revertedAxis = shapeSize - axis - 1; + switch (axisCastMode) { + case NHWC_TO_NCHW: + return revertedAxis > 3 ? -1 : nhwcToNchw[revertedAxis]; + case NCHW_TO_NHWC: + return revertedAxis > 3 ? -1 : nchwToNhwc[revertedAxis]; + default: + return revertedAxis; + } } inline Dim vectorProduct(const VectorDims& vec, size_t size) { diff --git a/src/plugins/intel_cpu/src/nodes/reduce.cpp b/src/plugins/intel_cpu/src/nodes/reduce.cpp index e3be52f7ae7d12..e09062a3f58d6a 100644 --- a/src/plugins/intel_cpu/src/nodes/reduce.cpp +++ b/src/plugins/intel_cpu/src/nodes/reduce.cpp @@ -1992,8 +1992,7 @@ void Reduce::initSupportedPrimitiveDescriptors() { if (axis < 0) axis += static_cast(getInputShapeAtPort(REDUCE_DATA).getRank()); } - // TODO: Per-channel layout is disabled due to accuracy issue in ACL Reduce Executor - // pushDesc(LayoutType::nspc, LayoutType::nspc, input_prec, output_prec, undef, true); + pushDesc(LayoutType::nspc, LayoutType::nspc, input_prec, output_prec, impl_desc_type::undef, true); pushDesc(LayoutType::ncsp, LayoutType::ncsp, input_prec, output_prec, impl_desc_type::undef, true); canUseAclExecutor = !supportedPrimitiveDescriptors.empty(); if (canUseAclExecutor) diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.cpp 
b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.cpp new file mode 100644 index 00000000000000..def7250dd5b938 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.cpp @@ -0,0 +1,38 @@ +// Copyright (C) 2020-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + + +#include "convert_reduce_no_keep_dims.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/opsets/opset8.hpp" + +template +ov::matcher_pass_callback ov::intel_cpu::ConvertReduceNoKeepDimsBase::convert_reduce() { + return [&](ov::pass::pattern::Matcher& m) { + auto reduce = std::dynamic_pointer_cast(m.get_match_root()); + if (!reduce || reduce->get_keep_dims()) { + return false; + } + + reduce->set_keep_dims(true); + const auto reduce_new = reduce->clone_with_new_inputs({reduce->input_value(0), reduce->input_value(1)}); + std::shared_ptr squeeze = std::make_shared(reduce_new, reduce->input_value(1)); + squeeze->set_friendly_name(reduce_new->get_friendly_name()); + ov::copy_runtime_info(reduce, {reduce_new, squeeze}); + ov::replace_node(reduce, squeeze); + + return true; + }; +} + +template +ov::intel_cpu::ConvertReduction::ConvertReduction() { + auto m = std::make_shared( + ov::pass::pattern::wrap_type({ov::pass::pattern::any_input(), + ov::pass::pattern::wrap_type()}), "ConvertReduction"); + register_matcher(m, convert_reduce()); +} + +template class ov::intel_cpu::ConvertReduction; +template class ov::intel_cpu::ConvertReduction; diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.hpp new file mode 100644 index 00000000000000..2f3c7d19726513 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.hpp @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2024 Intel Corporation +// SPDX-License-Identifier: 
Apache-2.0 + +#pragma once + +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/op/util/arithmetic_reductions_keep_dims.hpp" +#include "openvino/op/util/logical_reduction_keep_dims.hpp" + +/* + * Description: + * ConvertReduceNoKeepDimsBase detects Reduce operations with keepDims = false. + * Such Reduce operation is replaced with Reduce operation with keepDims = true and Squeeze + * which removes undesired dimensions. + * + * Before: + * + * +--------------+ +-----------------+ + * | Data | | Axes tensor | + * +-----------+--+ +-+---------------+ + * | | + * +---------------------------+ + * | Reduce (keepDims = false) | + * +---------------------------+ + * + * After: + * + * +--------------+ +-----------------+ + * | Data | | Axes tensor | + * +-----------+--+ +-+------------+--+ + * | | | + * +---------------------------+ | + * | Reduce (keepDims = true) | | + * +-----------------------+---+ | + * | | + * +--------v------v-+ + * | Squeeze | + * +-----------------+ + * + */ + +namespace ov { +namespace intel_cpu { + +class ConvertReduceNoKeepDimsBase: public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("ConvertReduceNoKeepDims", "0"); + template + ov::matcher_pass_callback convert_reduce(); +}; + +template +class ConvertReduction: public ConvertReduceNoKeepDimsBase { +public: + OPENVINO_RTTI("ConvertReduction", "0"); + ConvertReduction(); +}; + + +class ConvertReduceNoKeepDims: public ov::pass::GraphRewrite { +public: + OPENVINO_RTTI("ConvertReduceNoKeepDims", "0"); + ConvertReduceNoKeepDims() { + add_matcher>(); + add_matcher>(); + } +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 0da2f8fe0cb1c9..f84b0ba70b0e54 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ 
b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -122,6 +122,7 @@ #include "transformations/cpu_opset/arm/pass/convert_group_conv1d.hpp" #include "transformations/cpu_opset/arm/pass/convert_reduce_multi_axis.hpp" #include "transformations/cpu_opset/arm/pass/mish_decomposition.hpp" +#include "transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.hpp" #include "transformations/cpu_opset/common/pass/decompose_integer_divide.hpp" #include "transformations/cpu_opset/common/pass/convert_fq_rnn_to_quantized_rnn.hpp" #include "transformations/cpu_opset/common/pass/insert_convert_after_extension.hpp" @@ -432,6 +433,7 @@ void Transformations::PreLpt(const std::vector& defaultPrecis CPU_REGISTER_PASS_COMMON(manager, SwapConvertTranspose); CPU_REGISTER_PASS_X64(manager, ConvertToInteraction); CPU_REGISTER_PASS_X64(manager, ConvertInteractionInt8); + CPU_REGISTER_PASS_ARM(manager, ConvertReduceNoKeepDims); CPU_REGISTER_PASS_ARM(manager, ConvertReduceMultiAxis); CPU_REGISTER_PASS_ARM32(manager, MishDecomposition); CPU_REGISTER_PASS_ARM(manager, ConvertConv1D); diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/reduce.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/reduce.cpp index cd2f14949f3ac8..c57a4cc27c97af 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/reduce.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/reduce.cpp @@ -46,10 +46,7 @@ std::vector> inputShapes_SingleBatch = { std::vector cpuParams_4D = { CPUSpecificParams({nchw}, {nchw}, {}, {}), -//NHWC layout is disabled on ARM due to accuracy issue: https://github.com/ARM-software/ComputeLibrary/issues/1044 -#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) CPUSpecificParams({nhwc}, {nhwc}, {}, {}), -#endif }; /* ================================ 1.1 No fusion - Arithmetic 
================================ */ @@ -160,4 +157,4 @@ INSTANTIATE_TEST_SUITE_P( } // namespace Reduce } // namespace test -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/unit/transformations/arm/convert_reduce_no_keep_dims.cpp b/src/plugins/intel_cpu/tests/unit/transformations/arm/convert_reduce_no_keep_dims.cpp new file mode 100644 index 00000000000000..2881fd68a820f4 --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/transformations/arm/convert_reduce_no_keep_dims.cpp @@ -0,0 +1,91 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include "common_test_utils/ov_test_utils.hpp" + +using namespace ov::intel_cpu; + +template +class ConvertReduceNoKeepDimsTest : public testing::Test {}; + +template +static std::shared_ptr createInitGraph(std::shared_ptr param) { + auto axes = ov::opset1::Constant::create(ov::element::i64, ov::Shape{2}, {0, 1}); + auto reduce = std::make_shared(param, axes, false); + return std::make_shared(ov::NodeVector{ reduce }, ov::ParameterVector{ param }); +} + +template +static std::shared_ptr createRefGraph(std::shared_ptr param) { + auto axes = ov::opset1::Constant::create(ov::element::i64, ov::Shape{2}, {0, 1}); + auto reduce = std::make_shared(param, axes, true); + auto squeeze = std::make_shared(reduce, axes); + return std::make_shared(ov::NodeVector{ squeeze }, ov::ParameterVector{ param }); +} + +template +static bool registerAndRunReducePass(std::shared_ptr model) { + ov::pass::Manager manager; + if (std::is_base_of::value) { + manager.register_pass>(); + } else if (std::is_base_of::value) { + manager.register_pass>(); + } else { + return false; + } + manager.run_passes(model); + return true; +} + +static ov::Shape static_param_shape = ov::Shape{2, 19, 2, 9}; +static ov::PartialShape dynamic_param_shape = ov::PartialShape{2, -1, 2, 9}; + +TYPED_TEST_SUITE_P(ConvertReduceNoKeepDimsTest); + 
+TYPED_TEST_P(ConvertReduceNoKeepDimsTest, CheckConvertReduceTransformationIsAppliedForStaticShapes) { + ov::element::Type_t dataType = std::is_base_of::value ? + ov::element::boolean : ov::element::f32; + auto param = std::make_shared(dataType, static_param_shape); + auto model = createInitGraph(param); + auto model_ref = createRefGraph(param); + + if (!registerAndRunReducePass(model)) { + FAIL() << "Reduce pass is not registered."; + } + + auto res = compare_functions(model, model_ref); + ASSERT_TRUE(res.first) << res.second; +} + +TYPED_TEST_P(ConvertReduceNoKeepDimsTest, CheckConvertReduceTransformationIsAppliedForDynaimcShapes) { + ov::element::Type_t dataType = std::is_base_of::value ? + ov::element::boolean : ov::element::f32; + auto param = std::make_shared(dataType, dynamic_param_shape); + auto model = createInitGraph(param); + auto model_ref = createRefGraph(param); + + if (!registerAndRunReducePass(model)) { + FAIL() << "Reduce pass is not registered."; + } + + auto res = compare_functions(model, model_ref); + ASSERT_TRUE(res.first) << res.second; +} + +REGISTER_TYPED_TEST_SUITE_P(ConvertReduceNoKeepDimsTest, + CheckConvertReduceTransformationIsAppliedForStaticShapes, + CheckConvertReduceTransformationIsAppliedForDynaimcShapes); + +using reduceTypes = ::testing::Types; +INSTANTIATE_TYPED_TEST_SUITE_P(ConvertReduce, ConvertReduceNoKeepDimsTest, reduceTypes);