diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp index fa6881123af22b..8976271fb7f8a3 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp @@ -13,93 +13,6 @@ namespace ov { namespace intel_cpu { -static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, - const ExecutorContext::CPtr context, - const FCAttrs &attrs, - const ACLTensorAttrs& aclTensorAttrs, - const ACLFCAttrs& aclfcAttrs, - const PostOps &postOps) { - DEBUG_LOG("ACLFullyConnectedExecutor: prepack weights"); - const auto& wgtDims = memory.at(ARG_WEI)->getStaticDims(); - const auto N = wgtDims[0]; - const auto K = wgtDims[1]; - - auto create = [&]() { - MemoryPtr final_ptr = memory.at(ARG_WEI); - if (aclfcAttrs.isConvertedWeights) { - MemoryArgs memoryArgs; - memoryArgs[ARG_SRC_0] = memory.at(ARG_WEI); - memoryArgs[ARG_DST] = std::make_shared<Memory>(context->getEngine(), - CpuBlockedMemoryDesc(aclfcAttrs.inputPrecision, - memory.at(ARG_WEI)->getShape())); - auto aclWeightsConverter = std::make_shared<acl_fc_executor::ACLWeightsConverter>(); - if (aclWeightsConverter->update(memoryArgs)) { - aclWeightsConverter->execute(memoryArgs); - final_ptr = memoryArgs[ARG_DST]; - } - } - if (!aclfcAttrs.weightsNonTransposed) { - auto reverse_weights_dims = memory.at(ARG_WEI)->getStaticDims(); - if (reverse_weights_dims.size() == 3) { - reverse_weights_dims = VectorDims( - {reverse_weights_dims[0] * reverse_weights_dims[1], reverse_weights_dims[2]}); - } - std::reverse(reverse_weights_dims.begin(), reverse_weights_dims.end()); - MemoryArgs memoryArgs; - memoryArgs[ARG_SRC_0] = final_ptr; - memoryArgs[ARG_DST] = std::make_shared<Memory>(context->getEngine(), - CpuBlockedMemoryDesc(final_ptr->getPrecision(), - intel_cpu::Shape(reverse_weights_dims))); - auto aclWeightsTranspose = std::make_shared<acl_fc_executor::ACLWeightsTranspose>(); - if (aclWeightsTranspose->update(memoryArgs)) { - 
aclWeightsTranspose->execute(memoryArgs); - final_ptr = memoryArgs[ARG_DST]; - } - } - if (!memory.at(ARG_SRC)->getShape().isDynamic()) { - arm_compute::WeightFormat expectedWeightFormat; - bool isNeededReorder; - { - MemoryArgs memoryArgs; - memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0); - memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS); - memoryArgs[ARG_WEI] = final_ptr; - memoryArgs[ARG_DST] = memory.at(ARG_DST); - auto aclWeightsRepack = std::make_shared<acl_fc_executor::ACLWeightFormatGenerator>(attrs, postOps, memoryArgs); - isNeededReorder = aclWeightsRepack->update(memoryArgs); - expectedWeightFormat = aclWeightsRepack->getOptImplWeightFormat(); - } - if (isNeededReorder) { - MemoryArgs memoryArgs; - memoryArgs[ARG_SRC_0] = final_ptr; - memoryArgs[ARG_DST] = std::make_shared<Memory>(context->getEngine(), - CpuBlockedMemoryDesc(final_ptr->getPrecision(), - final_ptr->getShape())); - auto aclWeightsReorder = std::make_shared<acl_fc_executor::ACLWeightsReorder>( - arm_compute::WeightFormat::OHWI, expectedWeightFormat); - if (aclWeightsReorder->update(memoryArgs)) { - aclWeightsReorder->execute(memoryArgs); - final_ptr = memoryArgs[ARG_DST]; - } - } - } - DEBUG_LOG("ACLFullyConnectedExecutor: cache miss, perform packing"); - return final_ptr; - }; - - auto weightCache = context->getWeightsCache(); - if (weightCache != nullptr) { - std::string format = "fc_acl_" + std::to_string(N) + "_" + std::to_string(K); - const std::string string_hash = format + "_" + std::to_string(memory.at(ARG_WEI)->getSize()) + "_" + - std::to_string(reinterpret_cast<uint64_t>(memory.at(ARG_WEI)->getData())); - DEBUG_LOG("ACLFullyConnectedExecutor: findOrCreate, string_hash: ", string_hash); - return *weightCache->findOrCreate(string_hash, create); - } - - DEBUG_LOG("ACLFullyConnectedExecutor: Weights cache is not available"); - return create(); -} - -static void initFCAttrs(const FCAttrs &attrs, ACLTensorAttrs& aclTensorAttrs, ACLFCAttrs& aclfcAttrs, @@ -132,7 +45,7 @@ ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, const MemoryArgs &memory, const 
ExecutorContext::CPtr context) { initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps); - packedWeights = prepareWeightMemory(memory, context, attrs, aclTensorAttrs, aclfcAttrs, postOps); + packedWeights = acl_fc_executor::prepareWeightMemory(memory, context, attrs, aclTensorAttrs, aclfcAttrs, postOps); } bool ACLFullyConnectedExecutor::supports(const FCConfig &config) { @@ -144,30 +57,8 @@ bool ACLFullyConnectedExecutor::supports(const FCConfig &config) { return true; } -static void updateFCTensorsShapes(ACLMemoryShapes& aclMemoryShapes) { - if (aclMemoryShapes[ACLArgs::ACL_WEI].num_dimensions() == 3U) { - aclMemoryShapes[ACLArgs::ACL_WEI] = arm_compute::TensorShape( - {aclMemoryShapes[ACLArgs::ACL_WEI][0] * aclMemoryShapes[ACLArgs::ACL_WEI][1], - aclMemoryShapes[ACLArgs::ACL_WEI][2]}); - } - - if (one_of(aclMemoryShapes[ACLArgs::ACL_SRC_0].num_dimensions(), 3U, 4U)) { - aclMemoryShapes[ACLArgs::ACL_SRC_0] = arm_compute::TensorShape({ - aclMemoryShapes[ACLArgs::ACL_WEI][0], - aclMemoryShapes[ACLArgs::ACL_SRC_0].total_size() / aclMemoryShapes[ACLArgs::ACL_WEI][0]}); - } - - if (one_of(aclMemoryShapes[ACLArgs::ACL_DST].num_dimensions(), 3U, 4U)) { - aclMemoryShapes[ACLArgs::ACL_DST] = arm_compute::TensorShape({ - aclMemoryShapes[ACLArgs::ACL_WEI][1], - aclMemoryShapes[ACLArgs::ACL_SRC_0][1]}); - } - - std::swap(aclMemoryShapes[ACLArgs::ACL_WEI][0], aclMemoryShapes[ACLArgs::ACL_WEI][1]); -} - void ACLFullyConnectedExecutor::updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) { - updateFCTensorsShapes(aclMemoryShapes); + acl_fc_executor::updateFCTensorsShapes(aclMemoryShapes); } arm_compute::Status ACLFullyConnectedExecutor::validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) { @@ -200,88 +91,5 @@ ACLFunction ACLFullyConnectedExecutor::configureFunction(const ACLMemoryTensors return neFC; } -arm_compute::Status acl_fc_executor::ACLWeightsConverter::validateTensorsInfo(const ACLMemoryInfo &aclMemoryInfos) { - return 
arm_compute::NECast::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get(), - arm_compute::ConvertPolicy::SATURATE); -} - -ACLFunction acl_fc_executor::ACLWeightsConverter::configureFunction(const ACLMemoryTensors &aclMemoryTensors) { - auto neCast = std::make_unique<arm_compute::NECast>(); - neCast->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - aclMemoryTensors[ACLArgs::ACL_DST].get(), - arm_compute::ConvertPolicy::SATURATE); - return neCast; -} - - -arm_compute::Status acl_fc_executor::ACLWeightsTranspose::validateTensorsInfo(const ACLMemoryInfo &aclMemoryInfos) { - return arm_compute::NETranspose::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get()); -} - -ACLFunction acl_fc_executor::ACLWeightsTranspose::configureFunction(const ACLMemoryTensors &aclMemoryTensors) { - auto neTranspose = std::make_unique<arm_compute::NETranspose>(); - neTranspose->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - aclMemoryTensors[ACLArgs::ACL_DST].get()); - return neTranspose; -} - -acl_fc_executor::ACLWeightFormatGenerator::ACLWeightFormatGenerator(const FCAttrs &attrs, - const PostOps &postOps, - const MemoryArgs &memory) { - initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps); -} - -void acl_fc_executor::ACLWeightFormatGenerator::updateTensorsShapes(ACLMemoryShapes &aclMemoryShapes) { - updateFCTensorsShapes(aclMemoryShapes); -} - -arm_compute::Status acl_fc_executor::ACLWeightFormatGenerator::validateTensorsInfo(const ACLMemoryInfo &aclMemoryInfos) { - if (aclfcAttrs.isConvertedWeights) { - aclMemoryInfos[ACLArgs::ACL_WEI]->set_data_type(aclMemoryInfos[ACLArgs::ACL_SRC_0]->data_type()); - } - return arm_compute::NEFullyConnectedLayer::has_opt_impl( - expectedWeightFormat, - aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_WEI].get(), - aclMemoryInfos[ACLArgs::ACL_BIAS].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get(), - fullyConnectedLayerInfo, - weightsInfo); -} - 
-ACLFunction acl_fc_executor::ACLWeightFormatGenerator::configureFunction(const ACLMemoryTensors &aclMemoryTensors) { - return std::make_unique<arm_compute::NEFullyConnectedLayer>(); -} - -arm_compute::Status acl_fc_executor::ACLWeightsReorder::validateTensorsInfo(const ACLMemoryInfo &aclMemoryInfos) { -#if defined(OPENVINO_ARCH_ARM64) - return arm_compute::NEReorderLayer::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get(), - inWeightFormat, - outWeightFormat); -#else - return arm_compute::NECopy::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get()); -#endif -} - -ACLFunction acl_fc_executor::ACLWeightsReorder::configureFunction(const ACLMemoryTensors &aclMemoryTensors) { -#if defined(OPENVINO_ARCH_ARM64) - auto neReorderLayer = std::make_unique<arm_compute::NEReorderLayer>(); - neReorderLayer->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - aclMemoryTensors[ACLArgs::ACL_DST].get(), - inWeightFormat, - outWeightFormat); - return neReorderLayer; -#else - auto neCopy = std::make_unique<arm_compute::NECopy>(); - neCopy->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - aclMemoryTensors[ACLArgs::ACL_DST].get()); - return neCopy; -#endif -} - } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp index 7d8ac83d23218f..7685c0fc039189 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp @@ -6,67 +6,11 @@ #include "acl_common_executor.hpp" #include "nodes/executors/fullyconnected_config.hpp" +#include "acl_weights.hpp" namespace ov { namespace intel_cpu { -struct ACLFCAttrs { - ov::element::Type inputPrecision; - bool isConvertedWeights = false; - bool weightsNonTransposed; -}; - -namespace acl_fc_executor { - -class ACLWeightsConverter : public ACLCommonExecutor { -public: - ACLWeightsConverter() = default; - void 
updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) override {} - arm_compute::Status validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLMemoryTensors & aclMemoryTensors) override; -}; - -class ACLWeightsTranspose : public ACLCommonExecutor { -public: - ACLWeightsTranspose() = default; - void updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) override {} - arm_compute::Status validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLMemoryTensors & aclMemoryTensors) override; -}; - -class ACLWeightFormatGenerator : public ACLCommonExecutor { -public: - ACLWeightFormatGenerator(const FCAttrs& attrs, - const PostOps& postOps, - const MemoryArgs& memory); - void updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) override; - arm_compute::Status validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLMemoryTensors & aclMemoryTensors) override; - arm_compute::WeightFormat getOptImplWeightFormat() { - return expectedWeightFormat; - } -private: - arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo; - arm_compute::WeightsInfo weightsInfo; - ACLFCAttrs aclfcAttrs; - arm_compute::WeightFormat expectedWeightFormat; -}; - -class ACLWeightsReorder : public ACLCommonExecutor { -public: - ACLWeightsReorder(arm_compute::WeightFormat inWeightFormat, - arm_compute::WeightFormat outWeightFormat) - : inWeightFormat(inWeightFormat), outWeightFormat(outWeightFormat) {} - void updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) override {} - arm_compute::Status validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLMemoryTensors & aclMemoryTensors) override; -private: - arm_compute::WeightFormat inWeightFormat; - arm_compute::WeightFormat outWeightFormat; -}; - -} // namespace acl_fc_executor - class ACLFullyConnectedExecutor : public ACLCommonExecutor { 
public: ACLFullyConnectedExecutor(const FCAttrs& attrs, diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.cpp new file mode 100644 index 00000000000000..523c1fbc0f1397 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.cpp @@ -0,0 +1,237 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "acl_weights.hpp" + +#include "acl_utils.hpp" +#include "nodes/executors/executor.hpp" +#include "nodes/executors/memory_arguments.hpp" +#include "utils/debug_capabilities.h" +#include "nodes/executors/debug_messages.hpp" +#include "nodes/executors/implementation_utils.hpp" + +namespace ov { +namespace intel_cpu { + +MemoryPtr acl_fc_executor::prepareWeightMemory(const MemoryArgs &memory, + const ExecutorContext::CPtr context, + const FCAttrs &attrs, + const ACLTensorAttrs& aclTensorAttrs, + const ACLFCAttrs& aclfcAttrs, + const PostOps &postOps) { + DEBUG_LOG("ACLFullyConnectedExecutor: prepack weights"); + const auto& wgtDims = memory.at(ARG_WEI)->getStaticDims(); + const auto N = wgtDims[0]; + const auto K = wgtDims[1]; + + auto create = [&]() { + MemoryPtr final_ptr = memory.at(ARG_WEI); + if (aclfcAttrs.isConvertedWeights) { + MemoryArgs memoryArgs; + memoryArgs[ARG_SRC_0] = memory.at(ARG_WEI); + memoryArgs[ARG_DST] = std::make_shared<Memory>(context->getEngine(), + CpuBlockedMemoryDesc(aclfcAttrs.inputPrecision, + memory.at(ARG_WEI)->getShape())); + auto aclWeightsConverter = std::make_shared<acl_fc_executor::ACLWeightsConverter>(); + if (aclWeightsConverter->update(memoryArgs)) { + aclWeightsConverter->execute(memoryArgs); + final_ptr = memoryArgs[ARG_DST]; + } + } + if (!aclfcAttrs.weightsNonTransposed) { + auto reverse_weights_dims = memory.at(ARG_WEI)->getStaticDims(); + if (reverse_weights_dims.size() == 3) { + reverse_weights_dims = VectorDims( + {reverse_weights_dims[0] * reverse_weights_dims[1], reverse_weights_dims[2]}); + } + 
std::reverse(reverse_weights_dims.begin(), reverse_weights_dims.end()); + MemoryArgs memoryArgs; + memoryArgs[ARG_SRC_0] = final_ptr; + memoryArgs[ARG_DST] = std::make_shared<Memory>(context->getEngine(), + CpuBlockedMemoryDesc(final_ptr->getPrecision(), + intel_cpu::Shape(reverse_weights_dims))); + auto aclWeightsTranspose = std::make_shared<acl_fc_executor::ACLWeightsTranspose>(); + if (aclWeightsTranspose->update(memoryArgs)) { + aclWeightsTranspose->execute(memoryArgs); + final_ptr = memoryArgs[ARG_DST]; + } + } + if (!memory.at(ARG_SRC)->getShape().isDynamic()) { + arm_compute::WeightFormat expectedWeightFormat; + bool isNeededReorder; + { + MemoryArgs memoryArgs; + memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0); + memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS); + memoryArgs[ARG_WEI] = final_ptr; + memoryArgs[ARG_DST] = memory.at(ARG_DST); + auto aclWeightsRepack = std::make_shared<acl_fc_executor::ACLWeightFormatGenerator>(attrs, postOps, memoryArgs); + isNeededReorder = aclWeightsRepack->update(memoryArgs); + expectedWeightFormat = aclWeightsRepack->getOptImplWeightFormat(); + } + if (isNeededReorder) { + MemoryArgs memoryArgs; + memoryArgs[ARG_SRC_0] = final_ptr; + memoryArgs[ARG_DST] = std::make_shared<Memory>(context->getEngine(), + CpuBlockedMemoryDesc(final_ptr->getPrecision(), + final_ptr->getShape())); + auto aclWeightsReorder = std::make_shared<acl_fc_executor::ACLWeightsReorder>( + arm_compute::WeightFormat::OHWI, expectedWeightFormat); + if (aclWeightsReorder->update(memoryArgs)) { + aclWeightsReorder->execute(memoryArgs); + final_ptr = memoryArgs[ARG_DST]; + } + } + } + DEBUG_LOG("ACLFullyConnectedExecutor: cache miss, perform packing"); + return final_ptr; + }; + + auto weightCache = context->getWeightsCache(); + if (weightCache != nullptr) { + std::string format = "fc_acl_" + std::to_string(N) + "_" + std::to_string(K); + const std::string string_hash = format + "_" + std::to_string(memory.at(ARG_WEI)->getSize()) + "_" + + std::to_string(reinterpret_cast<uint64_t>(memory.at(ARG_WEI)->getData())); + DEBUG_LOG("ACLFullyConnectedExecutor: findOrCreate, string_hash: ", string_hash); + return 
*weightCache->findOrCreate(string_hash, create); + } + + DEBUG_LOG("ACLFullyConnectedExecutor: Weights cache is not available"); + return create(); +} + +static void initFCAttrs(const FCAttrs &attrs, + ACLTensorAttrs& aclTensorAttrs, + ACLFCAttrs& aclfcAttrs, + const MemoryArgs &memory, + arm_compute::FullyConnectedLayerInfo& fullyConnectedLayerInfo, + const PostOps &postOps) { + aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc); + fullyConnectedLayerInfo.weights_trained_layout = getAclDataLayoutByMemoryDesc(memory.at(ARG_WEI)->getDescPtr()); + aclfcAttrs.inputPrecision = memory.at(ARG_SRC)->getDescPtr()->getPrecision(); + fullyConnectedLayerInfo.transpose_weights = false; + aclfcAttrs.weightsNonTransposed = attrs.weightsNonTransposed; + + // Add postops + if (!postOps.empty() && postOps.size() == 1) { + if (const auto activation = std::dynamic_pointer_cast<ActivationPostOp>(postOps[0])) { + fullyConnectedLayerInfo.activation_info = getActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()), + activation->alpha(), + activation->beta(), + activation->gamma()); + } + } + + if (memory.at(ARG_SRC)->getPrecision() != memory.at(ARG_WEI)->getPrecision()) { + aclfcAttrs.isConvertedWeights = true; + } +} + +arm_compute::Status acl_fc_executor::ACLWeightsConverter::validateTensorsInfo(const ACLMemoryInfo &aclMemoryInfos) { + return arm_compute::NECast::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get(), + arm_compute::ConvertPolicy::SATURATE); +} + +ACLFunction acl_fc_executor::ACLWeightsConverter::configureFunction(const ACLMemoryTensors &aclMemoryTensors) { + auto neCast = std::make_unique<arm_compute::NECast>(); + neCast->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), + aclMemoryTensors[ACLArgs::ACL_DST].get(), + arm_compute::ConvertPolicy::SATURATE); + return neCast; +} + + +arm_compute::Status acl_fc_executor::ACLWeightsTranspose::validateTensorsInfo(const ACLMemoryInfo &aclMemoryInfos) { + return 
arm_compute::NETranspose::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get()); +} + +ACLFunction acl_fc_executor::ACLWeightsTranspose::configureFunction(const ACLMemoryTensors &aclMemoryTensors) { + auto neTranspose = std::make_unique<arm_compute::NETranspose>(); + neTranspose->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), + aclMemoryTensors[ACLArgs::ACL_DST].get()); + return neTranspose; +} + +acl_fc_executor::ACLWeightFormatGenerator::ACLWeightFormatGenerator(const FCAttrs &attrs, + const PostOps &postOps, + const MemoryArgs &memory) { + initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps); +} + +void acl_fc_executor::updateFCTensorsShapes(ACLMemoryShapes& aclMemoryShapes) { + if (aclMemoryShapes[ACLArgs::ACL_WEI].num_dimensions() == 3U) { + aclMemoryShapes[ACLArgs::ACL_WEI] = arm_compute::TensorShape( + {aclMemoryShapes[ACLArgs::ACL_WEI][0] * aclMemoryShapes[ACLArgs::ACL_WEI][1], + aclMemoryShapes[ACLArgs::ACL_WEI][2]}); + } + + if (one_of(aclMemoryShapes[ACLArgs::ACL_SRC_0].num_dimensions(), 3U, 4U)) { + aclMemoryShapes[ACLArgs::ACL_SRC_0] = arm_compute::TensorShape({ + aclMemoryShapes[ACLArgs::ACL_WEI][0], + aclMemoryShapes[ACLArgs::ACL_SRC_0].total_size() / aclMemoryShapes[ACLArgs::ACL_WEI][0]}); + } + + if (one_of(aclMemoryShapes[ACLArgs::ACL_DST].num_dimensions(), 3U, 4U)) { + aclMemoryShapes[ACLArgs::ACL_DST] = arm_compute::TensorShape({ + aclMemoryShapes[ACLArgs::ACL_WEI][1], + aclMemoryShapes[ACLArgs::ACL_SRC_0][1]}); + } + + std::swap(aclMemoryShapes[ACLArgs::ACL_WEI][0], aclMemoryShapes[ACLArgs::ACL_WEI][1]); +} + +void acl_fc_executor::ACLWeightFormatGenerator::updateTensorsShapes(ACLMemoryShapes &aclMemoryShapes) { + updateFCTensorsShapes(aclMemoryShapes); +} + +arm_compute::Status acl_fc_executor::ACLWeightFormatGenerator::validateTensorsInfo(const ACLMemoryInfo &aclMemoryInfos) { + if (aclfcAttrs.isConvertedWeights) { + 
aclMemoryInfos[ACLArgs::ACL_WEI]->set_data_type(aclMemoryInfos[ACLArgs::ACL_SRC_0]->data_type()); + } + return arm_compute::NEFullyConnectedLayer::has_opt_impl( + expectedWeightFormat, + aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_WEI].get(), + aclMemoryInfos[ACLArgs::ACL_BIAS].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get(), + fullyConnectedLayerInfo, + weightsInfo); +} + +ACLFunction acl_fc_executor::ACLWeightFormatGenerator::configureFunction(const ACLMemoryTensors &aclMemoryTensors) { + return std::make_unique<arm_compute::NEFullyConnectedLayer>(); +} + +arm_compute::Status acl_fc_executor::ACLWeightsReorder::validateTensorsInfo(const ACLMemoryInfo &aclMemoryInfos) { +#if defined(OPENVINO_ARCH_ARM64) + return arm_compute::NEReorderLayer::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get(), + inWeightFormat, + outWeightFormat); +#else + return arm_compute::NECopy::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get()); +#endif +} + +ACLFunction acl_fc_executor::ACLWeightsReorder::configureFunction(const ACLMemoryTensors &aclMemoryTensors) { +#if defined(OPENVINO_ARCH_ARM64) + auto neReorderLayer = std::make_unique<arm_compute::NEReorderLayer>(); + neReorderLayer->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), + aclMemoryTensors[ACLArgs::ACL_DST].get(), + inWeightFormat, + outWeightFormat); + return neReorderLayer; +#else + auto neCopy = std::make_unique<arm_compute::NECopy>(); + neCopy->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), + aclMemoryTensors[ACLArgs::ACL_DST].get()); + return neCopy; +#endif +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.hpp new file mode 100644 index 00000000000000..6c328a2bca149f --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_weights.hpp @@ -0,0 +1,81 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + 
+#pragma once + +#include "acl_common_executor.hpp" +#include "nodes/executors/fullyconnected_config.hpp" +#include "nodes/executors/memory_arguments.hpp" + +namespace ov { +namespace intel_cpu { + +struct ACLFCAttrs { + ov::element::Type inputPrecision; + bool isConvertedWeights = false; + bool weightsNonTransposed; +}; + +namespace acl_fc_executor { + +MemoryPtr prepareWeightMemory(const MemoryArgs &memory, + const ExecutorContext::CPtr context, + const FCAttrs &attrs, + const ACLTensorAttrs& aclTensorAttrs, + const ACLFCAttrs& aclfcAttrs, + const PostOps &postOps); + +void updateFCTensorsShapes(ACLMemoryShapes& aclMemoryShapes); + +class ACLWeightsConverter : public ACLCommonExecutor { +public: + ACLWeightsConverter() = default; + void updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) override {} + arm_compute::Status validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) override; + ACLFunction configureFunction(const ACLMemoryTensors & aclMemoryTensors) override; +}; + +class ACLWeightsTranspose : public ACLCommonExecutor { +public: + ACLWeightsTranspose() = default; + void updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) override {} + arm_compute::Status validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) override; + ACLFunction configureFunction(const ACLMemoryTensors & aclMemoryTensors) override; +}; + +class ACLWeightFormatGenerator : public ACLCommonExecutor { +public: + ACLWeightFormatGenerator(const FCAttrs& attrs, + const PostOps& postOps, + const MemoryArgs& memory); + void updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) override; + arm_compute::Status validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) override; + ACLFunction configureFunction(const ACLMemoryTensors & aclMemoryTensors) override; + arm_compute::WeightFormat getOptImplWeightFormat() { + return expectedWeightFormat; + } +private: + arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo; + arm_compute::WeightsInfo weightsInfo; + ACLFCAttrs aclfcAttrs; + 
arm_compute::WeightFormat expectedWeightFormat; +}; + +class ACLWeightsReorder : public ACLCommonExecutor { +public: + ACLWeightsReorder(arm_compute::WeightFormat inWeightFormat, + arm_compute::WeightFormat outWeightFormat) + : inWeightFormat(inWeightFormat), outWeightFormat(outWeightFormat) {} + void updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) override {} + arm_compute::Status validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) override; + ACLFunction configureFunction(const ACLMemoryTensors & aclMemoryTensors) override; +private: + arm_compute::WeightFormat inWeightFormat; + arm_compute::WeightFormat outWeightFormat; +}; + +} // namespace acl_fc_executor + +} // namespace intel_cpu +} // namespace ov