Skip to content

Commit

Permalink
[CPU] [ARM] FullyConnected: int8 support
Browse files Browse the repository at this point in the history
  • Loading branch information
eshoguli committed Jul 31, 2024
1 parent 1ecaa15 commit 1de5490
Show file tree
Hide file tree
Showing 96 changed files with 637 additions and 53 deletions.
5 changes: 3 additions & 2 deletions src/plugins/intel_cpu/src/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ struct Config {
Unknown
};

bool collectPerfCounters = false;
// TODO: workaround to collect performance counters
bool collectPerfCounters = true;
bool exclusiveAsyncRequests = false;
SnippetsMode snippetsMode = SnippetsMode::Enable;
std::string dumpToDot = {};
Expand Down Expand Up @@ -79,7 +80,7 @@ struct Config {
LPTransformsMode lpTransformsMode = LPTransformsMode::On;
#else
// Currently INT8 mode is not optimized on ARM / RISCV or other non-x86 platforms, fallback to FP32 mode.
LPTransformsMode lpTransformsMode = LPTransformsMode::Off;
LPTransformsMode lpTransformsMode = LPTransformsMode::On;
#endif
// default inference precision
ov::element::Type inferencePrecision = ov::element::f32;
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/intel_cpu/src/cpu_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,7 @@ void DnnlMemoryMngr::notifyUpdate() {

StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const void* data, bool pads_zeroing) :
m_eng(eng), m_pMemDesc(desc) {
OPENVINO_ASSERT(!desc->empty() || (desc->empty() && (data == nullptr)));
if (desc->getPrecision() == element::string) {
OPENVINO_THROW("[CPU] StaticMemory object cannot be created for string data.");
}
Expand All @@ -484,7 +485,7 @@ StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const vo

m_size = m_pMemDesc->getCurrentMemSize();

if (data) {
if (data || desc->empty()) {
m_pMemMngr = std::make_shared<StaticMemoryMngr>(const_cast<void*>(data), m_size);
} else {
m_pMemMngr = std::make_shared<StaticMemoryMngr>(m_size);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
//

#include "acl_common_executor.hpp"

#include <ostream>

#include "acl_utils.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"
Expand Down Expand Up @@ -38,9 +41,9 @@ static void initACLTensorParams(const MemoryPtr& memoryPtr,
}
}

static ACLInfo initTensorInfo(const arm_compute::TensorShape& tensorShape,
const arm_compute::DataType& dataType,
const arm_compute::DataLayout& dataLayout) {
ACLInfo ACLCommonExecutor::initTensorInfo(const arm_compute::TensorShape& tensorShape,
const arm_compute::DataType& dataType,
const arm_compute::DataLayout& dataLayout) {
ACLInfo aclMemoryInfo = nullptr;
if (dataType != arm_compute::DataType::UNKNOWN) {
aclMemoryInfo = std::make_shared<arm_compute::TensorInfo>(
Expand Down Expand Up @@ -72,6 +75,9 @@ bool ACLCommonExecutor::update(const MemoryArgs &memory) {
ACLMemoryTypes aclDataType{};
ACLMemoryLayouts aclDataLayout{};
for (auto& cpu_mem_ptr : memory) {
if (cpu_mem_ptr.second->getSize() == 0) {
continue;
}
const ACLArgs index = argConvert.at(cpu_mem_ptr.first);
initACLTensorParams(cpu_mem_ptr.second, aclTensorAttrs,
aclMemoryShapes[index],
Expand Down Expand Up @@ -108,18 +114,79 @@ bool ACLCommonExecutor::update(const MemoryArgs &memory) {
configureThreadSafe([&] {
iFunction = configureFunction(aclMemoryTensors);
});

// for (auto& cpu_mem_ptr : memory) {
// const ACLArgs index = argConvert.at(cpu_mem_ptr.first);
// if (aclTensorAttrs.memoryUsageIndicator[index]) {
// aclMemoryTensors[index]->allocator()->import_memory(memory.at(cpu_mem_ptr.first)->getData());
// }
// }
return true;
}

//namespace {
//std::ostream& operator<<(std::ostream& os, const arm_compute::ITensorInfo* tensor_info) {
// const auto data_type = tensor_info->data_type();
// switch (data_type) {
// case arm_compute::DataType::S8: {
// return os << "S8";
// }
// case arm_compute::DataType::QSYMM8: {
// return os << "QSYMM8";
// }
// case arm_compute::DataType::QASYMM8: {
// return os << "QASYMM8";
// }
// case arm_compute::DataType::QASYMM8_SIGNED: {
// return os << "QASYMM8_SIGNED";
// }
// case arm_compute::DataType::S32: {
// return os << "S32";
// }
// case arm_compute::DataType::F32: {
// return os << "F32";
// }
// default: {
// return os << "[UNKNOWN]";
// }
// }
//}
//} // namespace

void ACLCommonExecutor::execute(const MemoryArgs &memory) {
    // TODO: Move import_memory() to update() function - CVS-145871
    // Bind the caller-provided CPU buffers to the ACL tensors right before
    // running the configured function. Slots whose memory is owned elsewhere
    // (memoryUsageIndicator[index] == false) are skipped, as are slots for
    // which no tensor was created during update() (null entry — e.g. empty
    // memory arguments).
    for (auto& cpu_mem_ptr : memory) {
        const ACLArgs index = argConvert.at(cpu_mem_ptr.first);
        if (aclTensorAttrs.memoryUsageIndicator[index] && aclMemoryTensors[index]) {
            aclMemoryTensors[index]->allocator()->import_memory(memory.at(cpu_mem_ptr.first)->getData());
        }
    }

    // All inputs/outputs are bound — run the prepared ACL function once.
    iFunction->run();
}

ACLCommonExecutor::~ACLCommonExecutor() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,11 @@ class ACLCommonExecutor : public Executor {

protected:
ACLTensorAttrs aclTensorAttrs;

virtual ACLInfo initTensorInfo(const arm_compute::TensorShape& tensorShape,
const arm_compute::DataType& dataType,
const arm_compute::DataLayout& dataLayout);

private:
ACLMemoryTensors aclMemoryTensors;
ACLFunction iFunction = nullptr;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_lowp_fullyconnected.hpp"

#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"

#include "nodes/executors/executor.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"
#include "nodes/executors/debug_messages.hpp"
#include "nodes/executors/implementation_utils.hpp"
#include "acl_weights.hpp"
#include "acl_utils.hpp"

namespace ov {
namespace intel_cpu {

// Translates generic FullyConnected attributes and post-ops into the
// ACL-specific tensor attributes, FC attributes and GEMM configuration used
// by the low-precision executor.
//
// @param attrs          generic FC attributes (weights transposition, scales)
// @param aclTensorAttrs [out] ACL tensor attributes (layout)
// @param aclfcAttrs     [out] ACL FC attributes (precision, weights handling)
// @param memory         memory arguments; ARG_SRC / ARG_WEI descriptors are read
// @param gemmInfo       [out] ACL GEMM configuration
// @param postOps        post-operations; a single activation may be fused
static void initFCAttrs(const FCAttrs& attrs,
                        ACLTensorAttrs& aclTensorAttrs,
                        ACLFCAttrs& aclfcAttrs,
                        const MemoryArgs& memory,
                        arm_compute::GEMMInfo& gemmInfo,
                        const PostOps& postOps) {
    aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc);
    aclfcAttrs.inputPrecision = memory.at(ARG_SRC)->getDescPtr()->getPrecision();
    // Weights come pre-arranged by prepareWeightMemory(), so ACL is not asked
    // to pre-transpose the B matrix. NOTE(review): confirm this holds for all
    // weight layouts accepted by supports().
    gemmInfo.set_pretranspose_B(false);
    aclfcAttrs.weightsNonTransposed = attrs.weightsNonTransposed;

    // Fuse a single activation post-op into the GEMM, if present.
    // (postOps.size() == 1 already implies the list is non-empty.)
    if (postOps.size() == 1) {
        if (const auto activation = std::dynamic_pointer_cast<ActivationPostOp>(postOps[0])) {
            gemmInfo.set_activation_info(getActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()),
                                                                activation->alpha(),
                                                                activation->beta(),
                                                                activation->gamma()));
        }
    }

    // A src/weights precision mismatch means the weights must be converted.
    if (memory.at(ARG_SRC)->getPrecision() != memory.at(ARG_WEI)->getPrecision()) {
        aclfcAttrs.isConvertedWeights = true;
    }
}

// Builds the low-precision FC executor: derives the ACL attributes/GEMM info
// from the FC config, then pre-packs the weights. packedWeights keeps the
// repacked blob alive so configureFunction() can import it later.
ACLLowpFullyConnectedExecutor::ACLLowpFullyConnectedExecutor(const FCAttrs &attrs,
                                                             const PostOps &postOps,
                                                             const MemoryArgs &memory,
                                                             const ExecutorContext::CPtr context) {
    // Must run first: prepareWeightMemory() reads aclTensorAttrs/aclfcAttrs
    // filled in by initFCAttrs().
    initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, gemmInfo, postOps);
    packedWeights = acl_fc_executor::prepareWeightMemory(memory, context, attrs, aclTensorAttrs, aclfcAttrs, postOps);
}

// Capability check for the low-precision (int8/uint8) FC executor.
// Returns false when any dequantization scale differs from 1.0, when the
// source precision is not i8/u8, when post-ops are present, when ranks fall
// outside [2..4], or when per-channel quantization is requested.
bool ACLLowpFullyConnectedExecutor::supports(const FCConfig &config) {
    // TODO: check weights layout
    const auto attrs = static_cast<FCAttrs>(config.attrs);
    // Non-unit dequantization scales are not handled by this executor.
    if (std::any_of(attrs.dequantizationScales.begin(),
                    attrs.dequantizationScales.end(),
                    [](float value) { return value != 1.f; })) {
        return false;
    }

    const auto precision = srcType(config);
    VERIFY(one_of(precision, ov::element::i8, ov::element::u8), UNSUPPORTED_SRC_PRECISIONS);
    // NOTE(review): rejecting all post-ops here conflicts with the activation
    // fusing path in initFCAttrs() — confirm which behavior is intended.
    VERIFY(postOpsNumbers(config) == 0, UNSUPPORTED_NUMBER_OF_POSTOPS);
    VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK);
    VERIFY(one_of(weiRank(config), 2U, 3U, 4U), UNSUPPORTED_WEI_RANK);
    VERIFY(attrs.dequantizationScales.size() <= 1, UNSUPPORTED_PER_CHANNEL_QUANTIZATION);
    return true;
}

// Normalizes the ACL tensor shapes for the FC case by delegating to the
// helper shared with the regular (non-quantized) FC executor.
void ACLLowpFullyConnectedExecutor::updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) {
    acl_fc_executor::updateFCTensorsShapes(aclMemoryShapes);
}

// Asks ACL whether a low-precision GEMM can be built for the given tensor
// descriptions. Bias is deliberately not forwarded — this executor does not
// pass a bias tensor to NEGEMMLowpMatrixMultiplyCore (see configureFunction).
arm_compute::Status ACLLowpFullyConnectedExecutor::validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) {
    return arm_compute::NEGEMMLowpMatrixMultiplyCore::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(),
                                                               aclMemoryInfos[ACLArgs::ACL_WEI].get(),
                                                               nullptr,  // bias not supported here
                                                               aclMemoryInfos[ACLArgs::ACL_DST].get(),
                                                               gemmInfo);
}

// Creates and configures the ACL low-precision GEMM for the prepared tensors.
// Bias is not passed, mirroring validateTensorsInfo().
ACLFunction ACLLowpFullyConnectedExecutor::configureFunction(const ACLMemoryTensors & aclMemoryTensors) {
    auto lowpGemm = std::make_unique<arm_compute::NEGEMMLowpMatrixMultiplyCore>();
    lowpGemm->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(),
                        aclMemoryTensors[ACLArgs::ACL_WEI].get(),
                        nullptr,  // bias not supported here
                        aclMemoryTensors.at(ACLArgs::ACL_DST).get(),
                        gemmInfo);

    // When the weights were converted or repacked, bind the pre-packed blob
    // directly and clear the WEI usage indicator so execute() does not
    // re-import the caller's (unpacked) weight memory over it.
    const bool bindPackedWeights = aclfcAttrs.isConvertedWeights || !aclfcAttrs.weightsNonTransposed;
    if (bindPackedWeights) {
        aclTensorAttrs.memoryUsageIndicator[ACLArgs::ACL_WEI] = false;
        aclMemoryTensors[ACLArgs::ACL_WEI]->allocator()->import_memory(packedWeights->getData());
    }
    return lowpGemm;
}

// TODO: move to ACLLowpExecutor
// TODO: move to ACLLowpExecutor
// Remaps plain 8-bit integer types to their ACL quantized counterparts
// (S8 -> QASYMM8_SIGNED, U8 -> QASYMM8) before building the TensorInfo —
// presumably because the lowp GEMM kernels expect quantized data types;
// confirm against the ACL NEGEMMLowpMatrixMultiplyCore documentation.
// All other data types are passed through unchanged.
ACLInfo ACLLowpFullyConnectedExecutor::initTensorInfo(const arm_compute::TensorShape& tensorShape,
                                                      const arm_compute::DataType& dataType,
                                                      const arm_compute::DataLayout& dataLayout) {
    const auto quantizedType = [&dataType]() {
        if (dataType == arm_compute::DataType::S8) {
            return arm_compute::DataType::QASYMM8_SIGNED;
        }
        if (dataType == arm_compute::DataType::U8) {
            return arm_compute::DataType::QASYMM8;
        }
        return dataType;
    }();

    return ACLCommonExecutor::initTensorInfo(tensorShape, quantizedType, dataLayout);
}

} // namespace intel_cpu
} // namespace ov
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "acl_common_executor.hpp"
#include "nodes/executors/fullyconnected_config.hpp"
#include "acl_weights.hpp"

namespace ov {
namespace intel_cpu {

// Low-precision (int8/uint8) FullyConnected executor backed by
// arm_compute::NEGEMMLowpMatrixMultiplyCore.
class ACLLowpFullyConnectedExecutor : public ACLCommonExecutor {
public:
    // Derives ACL attributes from the FC config and pre-packs the weights.
    ACLLowpFullyConnectedExecutor(const FCAttrs& attrs,
                                  const PostOps& postOps,
                                  const MemoryArgs& memory,
                                  const ExecutorContext::CPtr context);

    // Static capability check used during executor selection.
    static bool supports(const FCConfig& config);

    void updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) override;

    arm_compute::Status validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) override;

    ACLFunction configureFunction(const ACLMemoryTensors & aclMemoryTensors) override;

    impl_desc_type implType() const override {
        return impl_desc_type::gemm_acl;
    }

protected:
    // Remaps U8/S8 to ACL quantized data types before creating TensorInfo.
    ACLInfo initTensorInfo(const arm_compute::TensorShape& tensorShape,
                           const arm_compute::DataType& dataType,
                           const arm_compute::DataLayout& dataLayout) override;

private:
    arm_compute::GEMMInfo gemmInfo;
    // NOTE(review): weightsInfo appears unused in the implementation file —
    // confirm before removing.
    arm_compute::WeightsInfo weightsInfo;

    // Keeps the repacked weights blob alive for the executor's lifetime.
    MemoryCPtr packedWeights;
    ACLFCAttrs aclfcAttrs;
};

using ACLLowpFullyConnectedExecutorPtr = std::shared_ptr<ACLLowpFullyConnectedExecutor>;

} // namespace intel_cpu
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ inline int axisCast(const std::size_t axis, const std::size_t shapeSize, ACLAxis
* @param precision precision to be converted
* @return ComputeLibrary DataType or UNKNOWN if precision is not mapped to DataType
*/
inline arm_compute::DataType precisionToAclDataType(ov::element::Type precision) {
inline arm_compute::DataType precisionToAclDataType(const ov::element::Type& precision) {
switch (precision) {
case ov::element::i8: return arm_compute::DataType::S8;
case ov::element::u8: return arm_compute::DataType::U8;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#define UNSUPPORTED_DST_RANK " unsupported dst rank"
#define UNSUPPORTED_DST_STRIDES " unsupported dst strides"
#define HEURISTICS_MISMATCH " heuristics mismatch"
#define UNSUPPORTED_PER_CHANNEL_QUANTIZATION " unsupported per-channel quantization"

#define VERIFY(condition, ...) \
do { \
Expand Down
Loading

0 comments on commit 1de5490

Please sign in to comment.