Skip to content

Commit

Permalink
[CPU] [ACL] FullyConnected fp32 executor refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
eshoguli committed Jul 31, 2024
1 parent b83fb38 commit 1ecaa15
Show file tree
Hide file tree
Showing 4 changed files with 321 additions and 251 deletions.
196 changes: 2 additions & 194 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,93 +13,6 @@
namespace ov {
namespace intel_cpu {

// Packs the FullyConnected weights into the memory layout expected by the ACL
// kernels. The pipeline applies, in order and each step only when needed:
//   1. precision conversion of the weights (ACLWeightsConverter),
//   2. transposition when the weights are not already in the non-transposed
//      layout (ACLWeightsTranspose),
//   3. reorder into the optimal weight format reported by ACL
//      (ACLWeightFormatGenerator + ACLWeightsReorder) — static shapes only.
// The packed result is stored in the context's weights cache when available so
// executors sharing the same weights reuse one packed copy.
static MemoryPtr prepareWeightMemory(const MemoryArgs &memory,
                                     const ExecutorContext::CPtr context,
                                     const FCAttrs &attrs,
                                     const ACLTensorAttrs& aclTensorAttrs,
                                     const ACLFCAttrs& aclfcAttrs,
                                     const PostOps &postOps) {
    DEBUG_LOG("ACLFullyConnectedExecutor: prepack weights");
    const auto& wgtDims = memory.at(ARG_WEI)->getStaticDims();
    // N and K only participate in the cache key built below.
    const auto N = wgtDims[0];
    const auto K = wgtDims[1];

    // Deferred packing routine; invoked directly or through the weights cache.
    auto create = [&]() {
        MemoryPtr final_ptr = memory.at(ARG_WEI);
        // Step 1: convert weights to the activation precision when they were
        // flagged as converted.
        if (aclfcAttrs.isConvertedWeights) {
            MemoryArgs memoryArgs;
            memoryArgs[ARG_SRC_0] = memory.at(ARG_WEI);
            memoryArgs[ARG_DST] = std::make_shared<Memory>(context->getEngine(),
                                                           CpuBlockedMemoryDesc(aclfcAttrs.inputPrecision,
                                                                                memory.at(ARG_WEI)->getShape()));
            auto aclWeightsConverter = std::make_shared<acl_fc_executor::ACLWeightsConverter>();
            // On update() failure the unconverted weights are kept as-is.
            if (aclWeightsConverter->update(memoryArgs)) {
                aclWeightsConverter->execute(memoryArgs);
                final_ptr = memoryArgs[ARG_DST];
            }
        }
        // Step 2: transpose the (possibly converted) weights. A 3D weights
        // tensor is first collapsed to 2D by fusing its two outer dimensions.
        if (!aclfcAttrs.weightsNonTransposed) {
            auto reverse_weights_dims = memory.at(ARG_WEI)->getStaticDims();
            if (reverse_weights_dims.size() == 3) {
                reverse_weights_dims = VectorDims(
                        {reverse_weights_dims[0] * reverse_weights_dims[1], reverse_weights_dims[2]});
            }
            // The destination shape is the reversed 2D source shape.
            std::reverse(reverse_weights_dims.begin(), reverse_weights_dims.end());
            MemoryArgs memoryArgs;
            memoryArgs[ARG_SRC_0] = final_ptr;
            memoryArgs[ARG_DST] = std::make_shared<Memory>(context->getEngine(),
                                                           CpuBlockedMemoryDesc(final_ptr->getPrecision(),
                                                                                intel_cpu::Shape(reverse_weights_dims)));
            auto aclWeightsTranspose = std::make_shared<acl_fc_executor::ACLWeightsTranspose>();
            if (aclWeightsTranspose->update(memoryArgs)) {
                aclWeightsTranspose->execute(memoryArgs);
                final_ptr = memoryArgs[ARG_DST];
            }
        }
        // Step 3: with static source shapes, ask ACL whether an optimized
        // kernel exists and which weight format it expects; reorder if so.
        if (!memory.at(ARG_SRC)->getShape().isDynamic()) {
            arm_compute::WeightFormat expectedWeightFormat;
            bool isNeededReorder;
            {
                MemoryArgs memoryArgs;
                memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0);
                memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS);
                memoryArgs[ARG_WEI] = final_ptr;
                memoryArgs[ARG_DST] = memory.at(ARG_DST);
                auto aclWeightsRepack = std::make_shared<acl_fc_executor::ACLWeightFormatGenerator>(attrs, postOps, memoryArgs);
                // update() succeeding signals an optimized format is available.
                isNeededReorder = aclWeightsRepack->update(memoryArgs);
                expectedWeightFormat = aclWeightsRepack->getOptImplWeightFormat();
            }
            if (isNeededReorder) {
                MemoryArgs memoryArgs;
                memoryArgs[ARG_SRC_0] = final_ptr;
                memoryArgs[ARG_DST] = std::make_shared<Memory>(context->getEngine(),
                                                               CpuBlockedMemoryDesc(final_ptr->getPrecision(),
                                                                                    final_ptr->getShape()));
                // Reorder from the canonical OHWI layout to the optimal one.
                auto aclWeightsReorder = std::make_shared<acl_fc_executor::ACLWeightsReorder>(
                        arm_compute::WeightFormat::OHWI, expectedWeightFormat);
                if (aclWeightsReorder->update(memoryArgs)) {
                    aclWeightsReorder->execute(memoryArgs);
                    final_ptr = memoryArgs[ARG_DST];
                }
            }
        }
        DEBUG_LOG("ACLFullyConnectedExecutor: cache miss, perform packing");
        return final_ptr;
    };

    auto weightCache = context->getWeightsCache();
    if (weightCache != nullptr) {
        // Cache key combines dimensions, byte size and the raw data pointer of
        // the original weights, so identical constants share a packed blob.
        std::string format = "fc_acl_" + std::to_string(N) + "_" + std::to_string(K);
        const std::string string_hash = format + "_" + std::to_string(memory.at(ARG_WEI)->getSize()) + "_" +
                                        std::to_string(reinterpret_cast<uint64_t>(memory.at(ARG_WEI)->getData()));
        DEBUG_LOG("ACLFullyConnectedExecutor: findOrCreate, string_hash: ", string_hash);
        return *weightCache->findOrCreate(string_hash, create);
    }

    DEBUG_LOG("ACLFullyConnectedExecutor: Weights cache is not available");
    return create();
}

static void initFCAttrs(const FCAttrs &attrs,
ACLTensorAttrs& aclTensorAttrs,
ACLFCAttrs& aclfcAttrs,
Expand Down Expand Up @@ -132,7 +45,7 @@ ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs,
const MemoryArgs &memory,
const ExecutorContext::CPtr context) {
initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps);
packedWeights = prepareWeightMemory(memory, context, attrs, aclTensorAttrs, aclfcAttrs, postOps);
packedWeights = acl_fc_executor::prepareWeightMemory(memory, context, attrs, aclTensorAttrs, aclfcAttrs, postOps);
}

bool ACLFullyConnectedExecutor::supports(const FCConfig &config) {
Expand All @@ -144,30 +57,8 @@ bool ACLFullyConnectedExecutor::supports(const FCConfig &config) {
return true;
}

// Normalizes the FC tensor shapes to the 2D form the ACL kernel works with:
// collapses 3D weights, flattens 3D/4D activations/outputs to matrices, then
// swaps the two weight dimensions.
static void updateFCTensorsShapes(ACLMemoryShapes& aclMemoryShapes) {
    auto& weiShape = aclMemoryShapes[ACLArgs::ACL_WEI];
    auto& srcShape = aclMemoryShapes[ACLArgs::ACL_SRC_0];
    auto& dstShape = aclMemoryShapes[ACLArgs::ACL_DST];

    // Fuse the two outer dimensions of a 3D weights tensor into one.
    if (weiShape.num_dimensions() == 3U) {
        weiShape = arm_compute::TensorShape({weiShape[0] * weiShape[1], weiShape[2]});
    }

    // Flatten 3D/4D activations into a matrix whose inner dimension matches
    // the weights' inner dimension.
    if (one_of(srcShape.num_dimensions(), 3U, 4U)) {
        const auto innerDim = weiShape[0];
        srcShape = arm_compute::TensorShape({innerDim, srcShape.total_size() / innerDim});
    }

    // The flattened destination is (output channels) x (batch rows).
    if (one_of(dstShape.num_dimensions(), 3U, 4U)) {
        dstShape = arm_compute::TensorShape({weiShape[1], srcShape[1]});
    }

    // Finally exchange the two weight dimensions.
    std::swap(weiShape[0], weiShape[1]);
}

// Normalizes the ACL tensor shapes for the FC operation by delegating to the
// shared shape helper.
// NOTE(review): this span is a diff overlay — both the pre-refactoring call
// (updateFCTensorsShapes) and the post-refactoring one
// (acl_fc_executor::updateFCTensorsShapes) appear below; only one of the two
// calls belongs in the final file.
void ACLFullyConnectedExecutor::updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) {
    updateFCTensorsShapes(aclMemoryShapes);
    acl_fc_executor::updateFCTensorsShapes(aclMemoryShapes);
}

arm_compute::Status ACLFullyConnectedExecutor::validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) {
Expand Down Expand Up @@ -200,88 +91,5 @@ ACLFunction ACLFullyConnectedExecutor::configureFunction(const ACLMemoryTensors
return neFC;
}

// Checks that ACL can cast the weights from the source to the destination
// data type (saturating conversion).
arm_compute::Status acl_fc_executor::ACLWeightsConverter::validateTensorsInfo(const ACLMemoryInfo &aclMemoryInfos) {
    const auto srcInfo = aclMemoryInfos[ACLArgs::ACL_SRC_0].get();
    const auto dstInfo = aclMemoryInfos[ACLArgs::ACL_DST].get();
    return arm_compute::NECast::validate(srcInfo, dstInfo, arm_compute::ConvertPolicy::SATURATE);
}

// Builds the NECast function performing the saturating weight conversion.
ACLFunction acl_fc_executor::ACLWeightsConverter::configureFunction(const ACLMemoryTensors &aclMemoryTensors) {
    auto castOp = std::make_unique<arm_compute::NECast>();
    castOp->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(),
                      aclMemoryTensors[ACLArgs::ACL_DST].get(),
                      arm_compute::ConvertPolicy::SATURATE);
    return castOp;
}


// Checks that ACL can transpose the source tensor into the destination.
arm_compute::Status acl_fc_executor::ACLWeightsTranspose::validateTensorsInfo(const ACLMemoryInfo &aclMemoryInfos) {
    const auto srcInfo = aclMemoryInfos[ACLArgs::ACL_SRC_0].get();
    const auto dstInfo = aclMemoryInfos[ACLArgs::ACL_DST].get();
    return arm_compute::NETranspose::validate(srcInfo, dstInfo);
}

// Builds the NETranspose function used to flip the weight layout.
ACLFunction acl_fc_executor::ACLWeightsTranspose::configureFunction(const ACLMemoryTensors &aclMemoryTensors) {
    auto transposeOp = std::make_unique<arm_compute::NETranspose>();
    transposeOp->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(),
                           aclMemoryTensors[ACLArgs::ACL_DST].get());
    return transposeOp;
}

// Initializes the FC attributes/layer info so the subsequent has_opt_impl
// query in validateTensorsInfo() is made with the same configuration as the
// real executor.
acl_fc_executor::ACLWeightFormatGenerator::ACLWeightFormatGenerator(const FCAttrs &attrs,
                                                                    const PostOps &postOps,
                                                                    const MemoryArgs &memory) {
    initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps);
}

// Uses the same shape normalization as the FC executor so the format query
// sees identical tensor shapes.
void acl_fc_executor::ACLWeightFormatGenerator::updateTensorsShapes(ACLMemoryShapes &aclMemoryShapes) {
    updateFCTensorsShapes(aclMemoryShapes);
}

// Queries ACL for an optimized FC implementation. On success,
// expectedWeightFormat is filled with the weight layout that implementation
// requires (read back via getOptImplWeightFormat()).
arm_compute::Status acl_fc_executor::ACLWeightFormatGenerator::validateTensorsInfo(const ACLMemoryInfo &aclMemoryInfos) {
    // When the weights were converted, the query must use the converted data
    // type (the activations' type) or the lookup would not match reality.
    if (aclfcAttrs.isConvertedWeights) {
        aclMemoryInfos[ACLArgs::ACL_WEI]->set_data_type(aclMemoryInfos[ACLArgs::ACL_SRC_0]->data_type());
    }
    return arm_compute::NEFullyConnectedLayer::has_opt_impl(
            expectedWeightFormat,
            aclMemoryInfos[ACLArgs::ACL_SRC_0].get(),
            aclMemoryInfos[ACLArgs::ACL_WEI].get(),
            aclMemoryInfos[ACLArgs::ACL_BIAS].get(),
            aclMemoryInfos[ACLArgs::ACL_DST].get(),
            fullyConnectedLayerInfo,
            weightsInfo);
}

// Returns a default-constructed layer to satisfy the ACLCommonExecutor
// contract; presumably only the validation/query step of this class is ever
// used and the function is never executed — confirm against the base class.
ACLFunction acl_fc_executor::ACLWeightFormatGenerator::configureFunction(const ACLMemoryTensors &aclMemoryTensors) {
    return std::make_unique<arm_compute::NEFullyConnectedLayer>();
}

// Validates the weight reorder. On ARM64 a real format reorder
// (inWeightFormat -> outWeightFormat) is checked; on other targets ACL has no
// reorder kernel, so a plain copy is validated instead.
arm_compute::Status acl_fc_executor::ACLWeightsReorder::validateTensorsInfo(const ACLMemoryInfo &aclMemoryInfos) {
#if defined(OPENVINO_ARCH_ARM64)
    return arm_compute::NEReorderLayer::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(),
                                                 aclMemoryInfos[ACLArgs::ACL_DST].get(),
                                                 inWeightFormat,
                                                 outWeightFormat);
#else
    return arm_compute::NECopy::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(),
                                         aclMemoryInfos[ACLArgs::ACL_DST].get());
#endif
}

// Builds the weight-reorder function: NEReorderLayer on ARM64, a plain NECopy
// fallback on targets without a reorder kernel.
ACLFunction acl_fc_executor::ACLWeightsReorder::configureFunction(const ACLMemoryTensors &aclMemoryTensors) {
#if defined(OPENVINO_ARCH_ARM64)
    auto reorderOp = std::make_unique<arm_compute::NEReorderLayer>();
    reorderOp->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(),
                         aclMemoryTensors[ACLArgs::ACL_DST].get(),
                         inWeightFormat,
                         outWeightFormat);
    return reorderOp;
#else
    auto copyOp = std::make_unique<arm_compute::NECopy>();
    copyOp->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(),
                      aclMemoryTensors[ACLArgs::ACL_DST].get());
    return copyOp;
#endif
}

} // namespace intel_cpu
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -6,67 +6,11 @@

#include "acl_common_executor.hpp"
#include "nodes/executors/fullyconnected_config.hpp"
#include "acl_weights.hpp"

namespace ov {
namespace intel_cpu {

// Attributes specific to the ACL FullyConnected executor.
struct ACLFCAttrs {
    // Activation precision; converted weights are materialized in this type.
    ov::element::Type inputPrecision;
    // True when the weights require (or underwent) precision conversion.
    bool isConvertedWeights = false;
    // When set, the weight-transposition step during packing is skipped.
    bool weightsNonTransposed;
};

namespace acl_fc_executor {

// Converts weights between precisions via arm_compute::NECast (saturating).
// Shapes are passed through unchanged.
class ACLWeightsConverter : public ACLCommonExecutor {
public:
    ACLWeightsConverter() = default;
    void updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) override {}
    arm_compute::Status validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) override;
    ACLFunction configureFunction(const ACLMemoryTensors & aclMemoryTensors) override;
};

// Transposes the weight tensor via arm_compute::NETranspose. Shapes are
// passed through unchanged (the caller prepares the transposed destination).
class ACLWeightsTranspose : public ACLCommonExecutor {
public:
    ACLWeightsTranspose() = default;
    void updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) override {}
    arm_compute::Status validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) override;
    ACLFunction configureFunction(const ACLMemoryTensors & aclMemoryTensors) override;
};

// Queries ACL (NEFullyConnectedLayer::has_opt_impl) for the weight format the
// optimal FC kernel expects; the answer is read via getOptImplWeightFormat().
class ACLWeightFormatGenerator : public ACLCommonExecutor {
public:
    ACLWeightFormatGenerator(const FCAttrs& attrs,
                             const PostOps& postOps,
                             const MemoryArgs& memory);
    void updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) override;
    arm_compute::Status validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) override;
    ACLFunction configureFunction(const ACLMemoryTensors & aclMemoryTensors) override;
    // Weight format reported by the last has_opt_impl query.
    arm_compute::WeightFormat getOptImplWeightFormat() {
        return expectedWeightFormat;
    }
private:
    arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo;
    arm_compute::WeightsInfo weightsInfo;
    ACLFCAttrs aclfcAttrs;
    // Filled as an out-parameter by NEFullyConnectedLayer::has_opt_impl.
    arm_compute::WeightFormat expectedWeightFormat;
};

// Reorders weights from inWeightFormat to outWeightFormat. Implemented with
// NEReorderLayer on ARM64 and a plain NECopy on other targets.
class ACLWeightsReorder : public ACLCommonExecutor {
public:
    ACLWeightsReorder(arm_compute::WeightFormat inWeightFormat,
                      arm_compute::WeightFormat outWeightFormat)
        : inWeightFormat(inWeightFormat), outWeightFormat(outWeightFormat) {}
    void updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) override {}
    arm_compute::Status validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) override;
    ACLFunction configureFunction(const ACLMemoryTensors & aclMemoryTensors) override;
private:
    // Source weight layout (caller passes OHWI when packing FC weights).
    arm_compute::WeightFormat inWeightFormat;
    // Target layout, as reported by ACLWeightFormatGenerator.
    arm_compute::WeightFormat outWeightFormat;
};

} // namespace acl_fc_executor

class ACLFullyConnectedExecutor : public ACLCommonExecutor {
public:
ACLFullyConnectedExecutor(const FCAttrs& attrs,
Expand Down
Loading

0 comments on commit 1ecaa15

Please sign in to comment.