diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp index 99acb9070550dc..723d3d76601b4c 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp @@ -15,146 +15,6 @@ namespace ov { namespace intel_cpu { -static VectorDims makeDummyInputDims(const Shape& inShape, const Shape& wShape) { - const auto& weightDims = wShape.getStaticDims(); - - auto inMinDims = inShape.getMinDims(); - auto inMaxDims = inShape.getMaxDims(); - inMinDims.back() = weightDims.back(); - inMaxDims.back() = weightDims.back(); - - return MemoryDescUtils::makeDummyShape(Shape(inMinDims, inMaxDims)).getStaticDims(); -} - -static VectorDims makeDummyOutputDims(const VectorDims& inShape, const VectorDims& wShape, const size_t out_rank) { - size_t activationRank = inShape.size(); - size_t channelRank = wShape.size() - 1; - // activation weight output_shape - // NCHW CoCHW NCo - // TNC CoC TNCo - // NC CoC NCo - VectorDims outputShape(out_rank, 1); - // set Co - outputShape.back() = wShape[0]; - // set batch dims - size_t batchRank = activationRank - channelRank; - size_t startIdx = out_rank - batchRank - 1; - for (size_t i = 0; i < batchRank; i++) { - outputShape[i + startIdx] = inShape[i]; - } - - return outputShape; -} - -static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, - const ExecutorContext::CPtr context, - const FCAttrs &attrs, - const ACLFCAttrs& aclfcAttrs, - const PostOps &postOps) { - DEBUG_LOG("ACLFullyConnectedExecutor: prepack weights"); - const auto& wgtDims = memory.at(ARG_WEI)->getStaticDims(); - const auto N = wgtDims[0]; - const auto K = wgtDims[1]; - - auto create = [&]() { - MemoryPtr final_ptr = memory.at(ARG_WEI); - // Convert weights precision - if (aclfcAttrs.isConvertedWeights) { - MemoryArgs memoryArgs; - memoryArgs[ARG_SRC_0] = memory.at(ARG_WEI); - memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), - memoryArgs[ARG_SRC_0]->getDescPtr()->cloneWithNewPrecision( - aclfcAttrs.inputPrecision)); - auto aclWeightsConverter = std::make_shared(); - if (aclWeightsConverter->update(memoryArgs)) { - aclWeightsConverter->execute(memoryArgs); - } else { - auto count_wei_elem = std::accumulate(memoryArgs[ARG_SRC_0]->getStaticDims().begin(), - memoryArgs[ARG_SRC_0]->getStaticDims().end(), - 1, - std::multiplies<>()); - cpu_convert(memoryArgs[ARG_SRC_0]->getData(), - memoryArgs[ARG_DST]->getData(), - memoryArgs[ARG_SRC_0]->getPrecision(), - memoryArgs[ARG_DST]->getPrecision(), - count_wei_elem); - } - final_ptr = memoryArgs[ARG_DST]; - } - // Packed weights - { - arm_compute::WeightFormat expectedWeightFormat; - bool isNeededReorder; - { - MemoryArgs memoryArgs; - memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS); - memoryArgs[ARG_WEI] = final_ptr; - if (memory.at(ARG_SRC_0)->getShape().isDynamic()) { - const auto& inShape = memory.at(ARG_SRC_0)->getShape(); - const auto& wShape = final_ptr->getShape(); - const auto& inDymmyDims = makeDummyInputDims(inShape, wShape); - const auto& outDymmyDims = makeDummyOutputDims(inDymmyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank()); - memoryArgs[ARG_SRC_0] = std::make_shared(context->getEngine(), - memory.at(ARG_SRC_0)->getDescPtr()->cloneWithNewDims(inDymmyDims)); - memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), - memory.at(ARG_DST)->getDescPtr()->cloneWithNewDims(outDymmyDims)); - } else { - memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0); - memoryArgs[ARG_DST] = memory.at(ARG_DST); - } - auto aclWeightsRepack = std::make_shared(attrs, postOps, memoryArgs); - isNeededReorder = aclWeightsRepack->update(memoryArgs); - expectedWeightFormat = aclWeightsRepack->getOptImplWeightFormat(); - } - if (isNeededReorder) { - MemoryArgs memoryArgs; - memoryArgs[ARG_SRC_0] = final_ptr; - memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), - memoryArgs[ARG_SRC_0]->getDescPtr()->clone()); - auto aclWeightsReorder = std::make_shared( - arm_compute::WeightFormat::OHWI, expectedWeightFormat); - if (aclWeightsReorder->update(memoryArgs)) { - aclWeightsReorder->execute(memoryArgs); - final_ptr = memoryArgs[ARG_DST]; - } - } - } - // Transpose weights - if (!aclfcAttrs.weightsNonTransposed) { - auto reverse_weights_dims = memory.at(ARG_WEI)->getStaticDims(); - if (reverse_weights_dims.size() == 3) { - reverse_weights_dims = VectorDims( - {reverse_weights_dims[0] * reverse_weights_dims[1], reverse_weights_dims[2]}); - } - std::reverse(reverse_weights_dims.begin(), reverse_weights_dims.end()); - MemoryArgs memoryArgs; - memoryArgs[ARG_SRC_0] = final_ptr; - memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), - CpuBlockedMemoryDesc(final_ptr->getPrecision(), - intel_cpu::Shape(reverse_weights_dims))); - auto aclWeightsTranspose = std::make_shared(); - if (aclWeightsTranspose->update(memoryArgs)) { - aclWeightsTranspose->execute(memoryArgs); - final_ptr = memoryArgs[ARG_DST]; - } - } - DEBUG_LOG("ACLFullyConnectedExecutor: cache miss, perform packing"); - return final_ptr; - }; - - auto weightCache = context->getWeightsCache(); - if (weightCache != nullptr) { - std::string format = "fc_acl_" + std::to_string(N) + "_" + std::to_string(K); - const std::string string_hash = format + "_" + std::to_string(memory.at(ARG_WEI)->getSize()) + "_" + - std::to_string(reinterpret_cast(memory.at(ARG_WEI)->getData())); - DEBUG_LOG("ACLFullyConnectedExecutor: findOrCreate, string_hash: ", string_hash); - return *weightCache->findOrCreate(string_hash, create); - } - - DEBUG_LOG("ACLFullyConnectedExecutor: Weights cache is not available"); - return create(); -} - static bool checkPostOps(const PostOps &postOps) { // Add postops if (!postOps.empty() && postOps.size() == 1) { @@ -196,7 +56,7 @@ ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, const MemoryArgs &memory, const ExecutorContext::CPtr context) { initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps); - packedWeights = prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps); + packedWeights = acl_fc_executor::prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps); } bool ACLFullyConnectedExecutor::supports(const FCConfig &config) { @@ -212,30 +72,8 @@ bool ACLFullyConnectedExecutor::supports(const FCConfig &config) { return true; } -static void updateFCTensorsShapes(ACLShapes& aclMemoryShapes) { - if (aclMemoryShapes[ACLArgs::ACL_WEI].num_dimensions() == 3U) { - aclMemoryShapes[ACLArgs::ACL_WEI] = arm_compute::TensorShape( - {aclMemoryShapes[ACLArgs::ACL_WEI][0] * aclMemoryShapes[ACLArgs::ACL_WEI][1], - aclMemoryShapes[ACLArgs::ACL_WEI][2]}); - } - - if (one_of(aclMemoryShapes[ACLArgs::ACL_SRC_0].num_dimensions(), 3U, 4U)) { - aclMemoryShapes[ACLArgs::ACL_SRC_0] = arm_compute::TensorShape({ - aclMemoryShapes[ACLArgs::ACL_WEI][0], - aclMemoryShapes[ACLArgs::ACL_SRC_0].total_size() / aclMemoryShapes[ACLArgs::ACL_WEI][0]}); - } - - if (one_of(aclMemoryShapes[ACLArgs::ACL_DST].num_dimensions(), 3U, 4U)) { - aclMemoryShapes[ACLArgs::ACL_DST] = arm_compute::TensorShape({ - aclMemoryShapes[ACLArgs::ACL_WEI][1], - aclMemoryShapes[ACLArgs::ACL_SRC_0][1]}); - } - - std::swap(aclMemoryShapes[ACLArgs::ACL_WEI][0], aclMemoryShapes[ACLArgs::ACL_WEI][1]); -} - void ACLFullyConnectedExecutor::updateTensorsShapes(ACLShapes& aclMemoryShapes) { - updateFCTensorsShapes(aclMemoryShapes); + acl_fc_executor::updateFCTensorsShapes(aclMemoryShapes); } arm_compute::Status ACLFullyConnectedExecutor::validateTensorsInfo(const ACLInfos & aclMemoryInfos) { @@ -268,88 +106,5 @@ ACLFunction ACLFullyConnectedExecutor::configureFunction(const ACLTensors & aclM return neFC; } -arm_compute::Status acl_fc_executor::ACLWeightsConverter::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { - return arm_compute::NECast::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get(), - arm_compute::ConvertPolicy::SATURATE); -} - -ACLFunction acl_fc_executor::ACLWeightsConverter::configureFunction(const ACLTensors &aclMemoryTensors) { - auto neCast = std::make_unique(); - neCast->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - aclMemoryTensors[ACLArgs::ACL_DST].get(), - arm_compute::ConvertPolicy::SATURATE); - return neCast; -} - - -arm_compute::Status acl_fc_executor::ACLWeightsTranspose::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { - return arm_compute::NETranspose::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get()); -} - -ACLFunction acl_fc_executor::ACLWeightsTranspose::configureFunction(const ACLTensors &aclMemoryTensors) { - auto neTranspose = std::make_unique(); - neTranspose->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - aclMemoryTensors[ACLArgs::ACL_DST].get()); - return neTranspose; -} - -acl_fc_executor::ACLWeightFormatGenerator::ACLWeightFormatGenerator(const FCAttrs &attrs, - const PostOps &postOps, - const MemoryArgs &memory) { - initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps); -} - -void acl_fc_executor::ACLWeightFormatGenerator::updateTensorsShapes(ACLShapes &aclMemoryShapes) { - updateFCTensorsShapes(aclMemoryShapes); -} - -arm_compute::Status acl_fc_executor::ACLWeightFormatGenerator::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { - if (aclfcAttrs.isConvertedWeights) { - aclMemoryInfos[ACLArgs::ACL_WEI]->set_data_type(aclMemoryInfos[ACLArgs::ACL_SRC_0]->data_type()); - } - return arm_compute::NEFullyConnectedLayer::has_opt_impl( - expectedWeightFormat, - aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_WEI].get(), - aclMemoryInfos[ACLArgs::ACL_BIAS].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get(), - fullyConnectedLayerInfo, - weightsInfo); -} - -ACLFunction acl_fc_executor::ACLWeightFormatGenerator::configureFunction(const ACLTensors &aclMemoryTensors) { - return std::make_unique(); -} - -arm_compute::Status acl_fc_executor::ACLWeightsReorder::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { -#if defined(OPENVINO_ARCH_ARM64) - return arm_compute::NEReorderLayer::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get(), - inWeightFormat, - outWeightFormat); -#else - return arm_compute::NECopy::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get()); -#endif -} - -ACLFunction acl_fc_executor::ACLWeightsReorder::configureFunction(const ACLTensors &aclMemoryTensors) { -#if defined(OPENVINO_ARCH_ARM64) - auto neReorderLayer = std::make_unique(); - neReorderLayer->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - aclMemoryTensors[ACLArgs::ACL_DST].get(), - inWeightFormat, - outWeightFormat); - return neReorderLayer; -#else - auto neCopy = std::make_unique(); - neCopy->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - aclMemoryTensors[ACLArgs::ACL_DST].get()); - return neCopy; -#endif -} - } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp index 4d7f2e5ef91480..4fb4703ba2742b 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp @@ -6,67 +6,11 @@ #include "acl_common_executor.hpp" #include "nodes/executors/fullyconnected_config.hpp" +#include "acl_weights.hpp" namespace ov { namespace intel_cpu { -struct ACLFCAttrs { - ov::element::Type inputPrecision; - bool isConvertedWeights = false; - bool weightsNonTransposed; -}; - -namespace acl_fc_executor { - -class ACLWeightsConverter : public ACLCommonExecutor { -public: - ACLWeightsConverter() = default; - void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} - arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; -}; - -class ACLWeightsTranspose : public ACLCommonExecutor { -public: - ACLWeightsTranspose() = default; - void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} - arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; -}; - -class ACLWeightFormatGenerator : public ACLCommonExecutor { -public: - ACLWeightFormatGenerator(const FCAttrs& attrs, - const PostOps& postOps, - const MemoryArgs& memory); - void updateTensorsShapes(ACLShapes& aclMemoryShapes) override; - arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; - arm_compute::WeightFormat getOptImplWeightFormat() { - return expectedWeightFormat; - } -private: - arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo; - arm_compute::WeightsInfo weightsInfo; - ACLFCAttrs aclfcAttrs; - arm_compute::WeightFormat expectedWeightFormat; -}; - -class ACLWeightsReorder : public ACLCommonExecutor { -public: - ACLWeightsReorder(arm_compute::WeightFormat inWeightFormat, - arm_compute::WeightFormat outWeightFormat) - : inWeightFormat(inWeightFormat), outWeightFormat(outWeightFormat) {} - void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} - arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; -private: - arm_compute::WeightFormat inWeightFormat; - arm_compute::WeightFormat outWeightFormat; -}; - -} // namespace acl_fc_executor - class ACLFullyConnectedExecutor : public ACLCommonExecutor { public: ACLFullyConnectedExecutor(const FCAttrs& attrs, diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp index 72b45bcabebc20..7ca2b4b5dbaf36 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp @@ -15,46 +15,42 @@ #include "acl_weights.hpp" #include "acl_utils.hpp" +#include "nodes/common/cpu_convert.h" +#include "memory_desc/cpu_memory_desc_utils.h" + namespace ov { namespace intel_cpu { -bool checkAndInitPostOps(const PostOps &postOps, arm_compute::GEMMInfo& info) { +static bool checkPostOps(const PostOps &postOps) { // Add postops if (!postOps.empty() && postOps.size() == 1) { if (const auto activation = std::dynamic_pointer_cast(postOps[0])) { - auto activation_info = info.activation_info(); - auto result = getActivationLayerInfo(convertToEltwiseAlgorithm( - activation->type()), - activation_info, - activation->alpha(), - activation->beta(), - activation->gamma()); - if (result) { - info.set_activation_info(activation_info); + if (checkActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()))) { + return true; } - return result; } } return false; } -void initFCAttrs(const FCAttrs &attrs, - ACLTensorAttrs& aclTensorAttrs, - ACLFCAttrs& aclfcAttrs, - const MemoryArgs &memory, - arm_compute::GEMMInfo& gemmInfo, - const PostOps &postOps) { +static void initFCAttrs(const FCAttrs &attrs, + ACLTensorAttrs& aclTensorAttrs, + ACLFCAttrs& aclfcAttrs, + const MemoryArgs &memory, + arm_compute::GEMMInfo& fullyConnectedLayerInfo, + const PostOps &postOps) { aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc); - // TODO: not completed //fullyConnectedLayerInfo.weights_trained_layout = getAclDataLayoutByMemoryDesc(memory.at(ARG_WEI)->getDescPtr()); aclfcAttrs.inputPrecision = memory.at(ARG_SRC)->getDescPtr()->getPrecision(); - // TODO: not completed //fullyConnectedLayerInfo.transpose_weights = false; - gemmInfo.set_pretranspose_A(false); - gemmInfo.set_pretranspose_B(false); aclfcAttrs.weightsNonTransposed = attrs.weightsNonTransposed; - checkAndInitPostOps(postOps, gemmInfo); + if (checkPostOps(postOps)) { + auto activation = std::dynamic_pointer_cast(postOps[0]); + fullyConnectedLayerInfo.set_activation_info(getActivationLayerInfo( + convertToEltwiseAlgorithm(activation->type()), + activation->alpha(), activation->beta(), activation->gamma())); + } if (memory.at(ARG_SRC)->getPrecision() != memory.at(ARG_WEI)->getPrecision()) { aclfcAttrs.isConvertedWeights = true; @@ -66,7 +62,7 @@ ACLLowpFullyConnectedExecutor::ACLLowpFullyConnectedExecutor(const FCAttrs &attr const MemoryArgs &memory, const ExecutorContext::CPtr& context) : dequantizationScales(attrs.dequantizationScales) { initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, gemmInfo, postOps); - packedWeights = prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps); + packedWeights = acl_fc_executor::prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps); } bool ACLLowpFullyConnectedExecutor::supports(const FCConfig &config) { @@ -86,7 +82,7 @@ bool ACLLowpFullyConnectedExecutor::supports(const FCConfig &config) { } void ACLLowpFullyConnectedExecutor::updateTensorsShapes(ACLShapes& aclMemoryShapes) { - updateFCTensorsShapes(aclMemoryShapes); + acl_fc_executor::updateFCTensorsShapes(aclMemoryShapes); } arm_compute::Status ACLLowpFullyConnectedExecutor::validateTensorsInfo(const ACLInfos & aclMemoryInfos) { diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.cpp new file mode 100644 index 00000000000000..819be7ef057dce --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.cpp @@ -0,0 +1,300 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "acl_fullyconnected.hpp" +#include "acl_utils.hpp" +#include "nodes/executors/executor.hpp" +#include "nodes/executors/memory_arguments.hpp" +#include "utils/debug_capabilities.h" +#include "nodes/executors/debug_messages.hpp" +#include "nodes/executors/implementation_utils.hpp" +#include "nodes/common/cpu_convert.h" +#include "memory_desc/cpu_memory_desc_utils.h" + +namespace ov { +namespace intel_cpu { + +VectorDims acl_fc_executor::makeDummyInputDims(const Shape& inShape, const Shape& wShape) { + const auto& weightDims = wShape.getStaticDims(); + + auto inMinDims = inShape.getMinDims(); + auto inMaxDims = inShape.getMaxDims(); + inMinDims.back() = weightDims.back(); + inMaxDims.back() = weightDims.back(); + + return MemoryDescUtils::makeDummyShape(Shape(inMinDims, inMaxDims)).getStaticDims(); +} + +VectorDims acl_fc_executor::makeDummyOutputDims(const VectorDims& inShape, const VectorDims& wShape, const size_t out_rank) { + size_t activationRank = inShape.size(); + size_t channelRank = wShape.size() - 1; + // activation weight output_shape + // NCHW CoCHW NCo + // TNC CoC TNCo + // NC CoC NCo + VectorDims outputShape(out_rank, 1); + // set Co + outputShape.back() = wShape[0]; + // set batch dims + size_t batchRank = activationRank - channelRank; + size_t startIdx = out_rank - batchRank - 1; + for (size_t i = 0; i < batchRank; i++) { + outputShape[i + startIdx] = inShape[i]; + } + + return outputShape; +} + +MemoryPtr acl_fc_executor::prepareWeightMemory(const MemoryArgs &memory, + const ExecutorContext::CPtr context, + const FCAttrs &attrs, + const ACLFCAttrs& aclfcAttrs, + const PostOps &postOps) { + DEBUG_LOG("ACLFullyConnectedExecutor: prepack weights"); + const auto& wgtDims = memory.at(ARG_WEI)->getStaticDims(); + const auto N = wgtDims[0]; + const auto K = wgtDims[1]; + + auto create = [&]() { + MemoryPtr final_ptr = memory.at(ARG_WEI); + // Convert weights precision + if (aclfcAttrs.isConvertedWeights) { + MemoryArgs memoryArgs; + memoryArgs[ARG_SRC_0] = memory.at(ARG_WEI); + memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), + memoryArgs[ARG_SRC_0]->getDescPtr()->cloneWithNewPrecision( + aclfcAttrs.inputPrecision)); + auto aclWeightsConverter = std::make_shared(); + if (aclWeightsConverter->update(memoryArgs)) { + aclWeightsConverter->execute(memoryArgs); + } else { + auto count_wei_elem = std::accumulate(memoryArgs[ARG_SRC_0]->getStaticDims().begin(), + memoryArgs[ARG_SRC_0]->getStaticDims().end(), + 1, + std::multiplies<>()); + cpu_convert(memoryArgs[ARG_SRC_0]->getData(), + memoryArgs[ARG_DST]->getData(), + memoryArgs[ARG_SRC_0]->getPrecision(), + memoryArgs[ARG_DST]->getPrecision(), + count_wei_elem); + } + final_ptr = memoryArgs[ARG_DST]; + } + // Packed weights + { + arm_compute::WeightFormat expectedWeightFormat; + bool isNeededReorder; + { + MemoryArgs memoryArgs; + memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS); + memoryArgs[ARG_WEI] = final_ptr; + if (memory.at(ARG_SRC_0)->getShape().isDynamic()) { + const auto& inShape = memory.at(ARG_SRC_0)->getShape(); + const auto& wShape = final_ptr->getShape(); + const auto& inDymmyDims = makeDummyInputDims(inShape, wShape); + const auto& outDymmyDims = makeDummyOutputDims(inDymmyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank()); + memoryArgs[ARG_SRC_0] = std::make_shared(context->getEngine(), + memory.at(ARG_SRC_0)->getDescPtr()->cloneWithNewDims(inDymmyDims)); + memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), + memory.at(ARG_DST)->getDescPtr()->cloneWithNewDims(outDymmyDims)); + } else { + memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0); + memoryArgs[ARG_DST] = memory.at(ARG_DST); + } + auto aclWeightsRepack = std::make_shared(attrs, postOps, memoryArgs); + isNeededReorder = aclWeightsRepack->update(memoryArgs); + expectedWeightFormat = aclWeightsRepack->getOptImplWeightFormat(); + } + if (isNeededReorder) { + MemoryArgs memoryArgs; + memoryArgs[ARG_SRC_0] = final_ptr; + memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), + memoryArgs[ARG_SRC_0]->getDescPtr()->clone()); + auto aclWeightsReorder = std::make_shared( + arm_compute::WeightFormat::OHWI, expectedWeightFormat); + if (aclWeightsReorder->update(memoryArgs)) { + aclWeightsReorder->execute(memoryArgs); + final_ptr = memoryArgs[ARG_DST]; + } + } + } + // Transpose weights + if (!aclfcAttrs.weightsNonTransposed) { + auto reverse_weights_dims = memory.at(ARG_WEI)->getStaticDims(); + if (reverse_weights_dims.size() == 3) { + reverse_weights_dims = VectorDims( + {reverse_weights_dims[0] * reverse_weights_dims[1], reverse_weights_dims[2]}); + } + std::reverse(reverse_weights_dims.begin(), reverse_weights_dims.end()); + MemoryArgs memoryArgs; + memoryArgs[ARG_SRC_0] = final_ptr; + memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), + CpuBlockedMemoryDesc(final_ptr->getPrecision(), + intel_cpu::Shape(reverse_weights_dims))); + auto aclWeightsTranspose = std::make_shared(); + if (aclWeightsTranspose->update(memoryArgs)) { + aclWeightsTranspose->execute(memoryArgs); + final_ptr = memoryArgs[ARG_DST]; + } + } + DEBUG_LOG("ACLFullyConnectedExecutor: cache miss, perform packing"); + return final_ptr; + }; + + auto weightCache = context->getWeightsCache(); + if (weightCache != nullptr) { + std::string format = "fc_acl_" + std::to_string(N) + "_" + std::to_string(K); + const std::string string_hash = format + "_" + std::to_string(memory.at(ARG_WEI)->getSize()) + "_" + + std::to_string(reinterpret_cast(memory.at(ARG_WEI)->getData())); + DEBUG_LOG("ACLFullyConnectedExecutor: findOrCreate, string_hash: ", string_hash); + return *weightCache->findOrCreate(string_hash, create); + } + + DEBUG_LOG("ACLFullyConnectedExecutor: Weights cache is not available"); + return create(); +} + +static bool checkPostOps(const PostOps &postOps) { + // Add postops + if (!postOps.empty() && postOps.size() == 1) { + if (const auto activation = std::dynamic_pointer_cast(postOps[0])) { + if (checkActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()))) { + return true; + } + } + } + return false; +} + +static void initFCAttrs(const FCAttrs &attrs, + ACLTensorAttrs& aclTensorAttrs, + ACLFCAttrs& aclfcAttrs, + const MemoryArgs &memory, + arm_compute::FullyConnectedLayerInfo& fullyConnectedLayerInfo, + const PostOps &postOps) { + aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc); + fullyConnectedLayerInfo.weights_trained_layout = getAclDataLayoutByMemoryDesc(memory.at(ARG_WEI)->getDescPtr()); + aclfcAttrs.inputPrecision = memory.at(ARG_SRC)->getDescPtr()->getPrecision(); + fullyConnectedLayerInfo.transpose_weights = false; + aclfcAttrs.weightsNonTransposed = attrs.weightsNonTransposed; + + if (checkPostOps(postOps)) { + auto activation = std::dynamic_pointer_cast(postOps[0]); + fullyConnectedLayerInfo.activation_info = getActivationLayerInfo( + convertToEltwiseAlgorithm(activation->type()), + activation->alpha(), activation->beta(), activation->gamma()); + } + + if (memory.at(ARG_SRC)->getPrecision() != memory.at(ARG_WEI)->getPrecision()) { + aclfcAttrs.isConvertedWeights = true; + } +} + +void acl_fc_executor::updateFCTensorsShapes(ACLShapes& aclMemoryShapes) { + if (aclMemoryShapes[ACLArgs::ACL_WEI].num_dimensions() == 3U) { + aclMemoryShapes[ACLArgs::ACL_WEI] = arm_compute::TensorShape( + {aclMemoryShapes[ACLArgs::ACL_WEI][0] * aclMemoryShapes[ACLArgs::ACL_WEI][1], + aclMemoryShapes[ACLArgs::ACL_WEI][2]}); + } + + if (one_of(aclMemoryShapes[ACLArgs::ACL_SRC_0].num_dimensions(), 3U, 4U)) { + aclMemoryShapes[ACLArgs::ACL_SRC_0] = arm_compute::TensorShape({ + aclMemoryShapes[ACLArgs::ACL_WEI][0], + aclMemoryShapes[ACLArgs::ACL_SRC_0].total_size() / aclMemoryShapes[ACLArgs::ACL_WEI][0]}); + } + + if (one_of(aclMemoryShapes[ACLArgs::ACL_DST].num_dimensions(), 3U, 4U)) { + aclMemoryShapes[ACLArgs::ACL_DST] = arm_compute::TensorShape({ + aclMemoryShapes[ACLArgs::ACL_WEI][1], + aclMemoryShapes[ACLArgs::ACL_SRC_0][1]}); + } + + std::swap(aclMemoryShapes[ACLArgs::ACL_WEI][0], aclMemoryShapes[ACLArgs::ACL_WEI][1]); +} + +arm_compute::Status acl_fc_executor::ACLWeightsConverter::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { + return arm_compute::NECast::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get(), + arm_compute::ConvertPolicy::SATURATE); +} + +ACLFunction acl_fc_executor::ACLWeightsConverter::configureFunction(const ACLTensors &aclMemoryTensors) { + auto neCast = std::make_unique(); + neCast->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), + aclMemoryTensors[ACLArgs::ACL_DST].get(), + arm_compute::ConvertPolicy::SATURATE); + return neCast; +} + + +arm_compute::Status acl_fc_executor::ACLWeightsTranspose::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { + return arm_compute::NETranspose::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get()); +} + +ACLFunction acl_fc_executor::ACLWeightsTranspose::configureFunction(const ACLTensors &aclMemoryTensors) { + auto neTranspose = std::make_unique(); + neTranspose->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), + aclMemoryTensors[ACLArgs::ACL_DST].get()); + return neTranspose; +} + +acl_fc_executor::ACLWeightFormatGenerator::ACLWeightFormatGenerator(const FCAttrs &attrs, + const PostOps &postOps, + const MemoryArgs &memory) { + initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps); +} + +void acl_fc_executor::ACLWeightFormatGenerator::updateTensorsShapes(ACLShapes &aclMemoryShapes) { + updateFCTensorsShapes(aclMemoryShapes); +} + +arm_compute::Status acl_fc_executor::ACLWeightFormatGenerator::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { + if (aclfcAttrs.isConvertedWeights) { + aclMemoryInfos[ACLArgs::ACL_WEI]->set_data_type(aclMemoryInfos[ACLArgs::ACL_SRC_0]->data_type()); + } + return arm_compute::NEFullyConnectedLayer::has_opt_impl( + expectedWeightFormat, + aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_WEI].get(), + aclMemoryInfos[ACLArgs::ACL_BIAS].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get(), + fullyConnectedLayerInfo, + weightsInfo); +} + +ACLFunction acl_fc_executor::ACLWeightFormatGenerator::configureFunction(const ACLTensors &aclMemoryTensors) { + return std::make_unique(); +} + +arm_compute::Status acl_fc_executor::ACLWeightsReorder::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { +#if defined(OPENVINO_ARCH_ARM64) + return arm_compute::NEReorderLayer::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get(), + inWeightFormat, + outWeightFormat); +#else + return arm_compute::NECopy::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get()); +#endif +} + +ACLFunction acl_fc_executor::ACLWeightsReorder::configureFunction(const ACLTensors &aclMemoryTensors) { +#if defined(OPENVINO_ARCH_ARM64) + auto neReorderLayer = std::make_unique(); + neReorderLayer->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), + aclMemoryTensors[ACLArgs::ACL_DST].get(), + inWeightFormat, + outWeightFormat); + return neReorderLayer; +#else + auto neCopy = std::make_unique(); + neCopy->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), + aclMemoryTensors[ACLArgs::ACL_DST].get()); + return neCopy; +#endif +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.hpp new file mode 100644 index 00000000000000..6f558ecf053bca --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.hpp @@ -0,0 +1,82 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "acl_common_executor.hpp" +#include "nodes/executors/fullyconnected_config.hpp" + +namespace ov { +namespace intel_cpu { + +struct ACLFCAttrs { + ov::element::Type inputPrecision; + bool isConvertedWeights = false; + bool weightsNonTransposed; +}; + +namespace acl_fc_executor { + +VectorDims makeDummyInputDims(const Shape& inShape, const Shape& wShape); + +VectorDims makeDummyOutputDims(const VectorDims& inShape, const VectorDims& wShape, const size_t out_rank); + +MemoryPtr prepareWeightMemory(const MemoryArgs &memory, + const ExecutorContext::CPtr context, + const FCAttrs &attrs, + const ACLFCAttrs& aclfcAttrs, + const PostOps &postOps); + +void updateFCTensorsShapes(ACLShapes& aclMemoryShapes); + +class ACLWeightsConverter : public ACLCommonExecutor { +public: + ACLWeightsConverter() = default; + void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} + arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; + ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; +}; + +class ACLWeightsTranspose : public ACLCommonExecutor { +public: + ACLWeightsTranspose() = default; + void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} + arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; + ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; +}; + +class ACLWeightFormatGenerator : public ACLCommonExecutor { +public: + ACLWeightFormatGenerator(const FCAttrs& attrs, + const PostOps& postOps, + const MemoryArgs& memory); + void updateTensorsShapes(ACLShapes& aclMemoryShapes) override; + arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; + ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; + arm_compute::WeightFormat getOptImplWeightFormat() { + return expectedWeightFormat; + } +private: + arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo; + arm_compute::WeightsInfo weightsInfo; + ACLFCAttrs aclfcAttrs; + arm_compute::WeightFormat expectedWeightFormat; +}; + +class ACLWeightsReorder : public ACLCommonExecutor { +public: + ACLWeightsReorder(arm_compute::WeightFormat inWeightFormat, + arm_compute::WeightFormat outWeightFormat) + : inWeightFormat(inWeightFormat), outWeightFormat(outWeightFormat) {} + void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} + arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; + ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; +private: + arm_compute::WeightFormat inWeightFormat; + arm_compute::WeightFormat outWeightFormat; +}; + +} // namespace acl_fc_executor +} // namespace intel_cpu +} // namespace ov