Skip to content

Commit

Permalink
[CPU] [ARM] int8: GEMM support
Browse files Browse the repository at this point in the history
  • Loading branch information
eshoguli committed Jul 18, 2024
1 parent ea6c2b2 commit 4d3f499
Show file tree
Hide file tree
Showing 27 changed files with 772 additions and 173 deletions.
3 changes: 2 additions & 1 deletion src/plugins/intel_cpu/src/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ struct Config {
Unknown
};

bool collectPerfCounters = false;
// TODO: workaround to collect performance counters
bool collectPerfCounters = true;
bool exclusiveAsyncRequests = false;
SnippetsMode snippetsMode = SnippetsMode::Enable;
std::string dumpToDot = {};
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/intel_cpu/src/cpu_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,7 @@ void DnnlMemoryMngr::notifyUpdate() {

StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const void* data, bool pads_zeroing) :
m_eng(eng), m_pMemDesc(desc) {
OPENVINO_ASSERT(!desc->empty() || (desc->empty() && (data == nullptr)));
if (desc->getPrecision() == element::string) {
OPENVINO_THROW("[CPU] StaticMemory object cannot be created for string data.");
}
Expand All @@ -484,7 +485,7 @@ StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const vo

m_size = m_pMemDesc->getCurrentMemSize();

if (data) {
if (data || desc->empty()) {
m_pMemMngr = std::make_shared<StaticMemoryMngr>(const_cast<void*>(data), m_size);
} else {
m_pMemMngr = std::make_shared<StaticMemoryMngr>(m_size);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "src/core/utils/quantization/AsymmHelpers.h"

#include "acl_common_executor.hpp"
#include "acl_utils.hpp"
#include "nodes/executors/memory_arguments.hpp"
Expand Down Expand Up @@ -66,7 +68,16 @@ bool ACLCommonExecutor::update(const MemoryArgs &memory) {
ACLMemoryTypes aclDataType{};
ACLMemoryLayouts aclDataLayout{};
for (auto& cpu_mem_ptr : memory) {
// TODO: don't init empty tensor
if (cpu_mem_ptr.second->getSize() == 0) {
continue;
}
const ACLArgs index = argConvert.at(cpu_mem_ptr.first);

if (index == ACLArgs::ACL_DST) {
std::cout << std::endl;
}

initACLTensorParams(cpu_mem_ptr.second, aclTensorAttrs,
aclMemoryShapes[index],
aclDataType[index],
Expand All @@ -79,6 +90,9 @@ bool ACLCommonExecutor::update(const MemoryArgs &memory) {
// Initialize arm_compute::TensorInfo objects
ACLMemoryInfo aclMemoryInfos;
for (int i = 0; i < ACLArgs::COUNT_OF_ARGS; i++) {
if (i == ACLArgs::ACL_DST) {
std::cout << std::endl;
}
aclMemoryInfos[i] = initTensorInfo(aclMemoryShapes[i], aclDataType[i], aclDataLayout[i]);
}

Expand Down Expand Up @@ -108,6 +122,7 @@ void ACLCommonExecutor::execute(const MemoryArgs &memory) {
aclMemoryTensors[index]->allocator()->import_memory(memory.at(cpu_mem_ptr.first)->getData());
}
}

iFunction->run();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@ ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, const
aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc);
fullyConnectedLayerInfo.weights_trained_layout = getAclDataLayoutByMemoryDesc(memory.at(ARG_WEI)->getDescPtr());
fullyConnectedLayerInfo.transpose_weights = !attrs.weightsNonTransposed;
if (!attrs.dequantizationScales.empty()) {
dequantizationScale = attrs.dequantizationScales[0];
}

// Add postops
if (!postOps.empty() && postOps.size() == 1) {
Expand All @@ -35,20 +32,10 @@ ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, const
}

bool ACLFullyConnectedExecutor::supports(const FCConfig &config) {
// issue #<create and put number here>
const auto attrs = static_cast<FCAttrs>(config.attrs);
if (std::any_of(
attrs.dequantizationScales.begin(),
attrs.dequantizationScales.end(),
[](float value) { return value != 1.f;})) {
return false;
}

VERIFY(one_of(srcType(config), ov::element::f16, ov::element::f32, ov::element::i8), UNSUPPORTED_SRC_PRECISIONS);
VERIFY(one_of(srcType(config), ov::element::f16, ov::element::f32), UNSUPPORTED_SRC_PRECISIONS);
VERIFY(postOpsNumbers(config) < 2, UNSUPPORTED_NUMBER_OF_POSTOPS);
VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK);
VERIFY(one_of(weiRank(config), 2U, 3U), UNSUPPORTED_WEI_RANK);
VERIFY(static_cast<FCAttrs>(config.attrs).dequantizationScales.size() <= 1, UNSUPPORTED_PER_CHANNEL_QUANTIZATION);
return true;
}

Expand Down Expand Up @@ -87,43 +74,16 @@ arm_compute::Status ACLFullyConnectedExecutor::validateTensorsInfo(const ACLMemo
}

ACLFunction ACLFullyConnectedExecutor::configureFunction(const ACLMemoryTensors & aclMemoryTensors) {
const auto dstTensor = aclMemoryTensors.at(ACLArgs::ACL_DST).get();
if (dequantizationScale != 1.0) {
dstTensor->info()->set_quantization_info(arm_compute::QuantizationInfo(dequantizationScale, 0));
}

auto neFC = std::make_unique<arm_compute::NEFullyConnectedLayer>();
neFC->configure(
aclMemoryTensors[ACLArgs::ACL_SRC_0].get(),
aclMemoryTensors[ACLArgs::ACL_WEI].get(),
aclMemoryTensors[ACLArgs::ACL_BIAS].get(),
dstTensor,
aclMemoryTensors[ACLArgs::ACL_DST].get(),
fullyConnectedLayerInfo,
weightsInfo);
return neFC;
}

ACLInfo ACLFullyConnectedExecutor::initTensorInfo(const arm_compute::TensorShape& tensorShape,
const arm_compute::DataType& dataType,
const arm_compute::DataLayout& dataLayout) {
arm_compute::DataType fcDataType;
switch (dataType) {
case arm_compute::DataType::S8: {
fcDataType = arm_compute::DataType::QASYMM8_SIGNED;
break;
}
case arm_compute::DataType::U8: {
fcDataType = arm_compute::DataType::QASYMM8;
break;
}
default: {
fcDataType = dataType;
break;
}
}

return ACLCommonExecutor::initTensorInfo(tensorShape, fcDataType, dataLayout);
}

} // namespace intel_cpu
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,9 @@ class ACLFullyConnectedExecutor : public ACLCommonExecutor {
impl_desc_type implType() const override {
return impl_desc_type::gemm_acl;
}

protected:
ACLInfo initTensorInfo(const arm_compute::TensorShape& tensorShape,
const arm_compute::DataType& dataType,
const arm_compute::DataLayout& dataLayout) override;

private:
arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo;
arm_compute::WeightsInfo weightsInfo;
float dequantizationScale = 1.f;
};

using ACLFullyConnectedExecutorPtr = std::shared_ptr<ACLFullyConnectedExecutor>;
Expand Down
92 changes: 92 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_gemm.hpp"

#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"

#include "nodes/executors/executor.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"
#include "nodes/executors/debug_messages.hpp"
#include "nodes/executors/implementation_utils.hpp"

namespace ov {
namespace intel_cpu {

// Constructs the ACL int8 GEMM executor.
// Only `memory` is consumed here (to pick up the NHWC layout hint); the remaining
// parameters are part of the common executor-factory signature and are not used yet,
// so their names are commented out to avoid -Wunused-parameter noise.
ACLGEMMExecutor::ACLGEMMExecutor(const GEMMAttrs& /*attrs*/,
                                 const PostOps& /*postOps*/,
                                 const MemoryArgs& memory,
                                 const ExecutorContext::CPtr /*context*/) {
    aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc);
}

// Static capability check for the ACL low-precision GEMM executor.
// Returns true only for configurations the NEGEMMLowpMatrixMultiplyCore path can handle.
bool ACLGEMMExecutor::supports(const GEMMConfig &config) {
    // TODO: check weights layout
    // Bind by const reference: config.attrs is already a GEMMAttrs, copying it
    // (twice, as the original code did via static_cast) is wasteful.
    const auto& attrs = config.attrs;

    // Per-tensor dequantization with a non-trivial scale is not supported yet:
    // reject if any scale differs from 1.0.
    if (std::any_of(attrs.dequantizationScales.begin(),
                    attrs.dequantizationScales.end(),
                    [](float value) { return value != 1.f; })) {
        return false;
    }

    // NOTE(review): removed unused src1_dims/src2_dims locals — they dereferenced the
    // result of std::dynamic_pointer_cast without a null check (latent crash) and the
    // values were never read.
    VERIFY(one_of(srcType(config), ov::element::f16, ov::element::f32, ov::element::i8, ov::element::u8), UNSUPPORTED_SRC_PRECISIONS);
    VERIFY(postOpsNumbers(config) < 2, UNSUPPORTED_NUMBER_OF_POSTOPS);
    VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK);
    VERIFY(one_of(weiRank(config), 2U, 3U, 4U), UNSUPPORTED_WEI_RANK);
    VERIFY(attrs.dequantizationScales.size() <= 1, UNSUPPORTED_PER_CHANNEL_QUANTIZATION);
    return true;
}

void ACLGEMMExecutor::updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) {}

// Asks ACL whether NEGEMMLowpMatrixMultiplyCore accepts the given tensor descriptions.
arm_compute::Status ACLGEMMExecutor::validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) {
    const auto* srcInfo  = aclMemoryInfos[ACLArgs::ACL_SRC_0].get();
    const auto* weiInfo  = aclMemoryInfos[ACLArgs::ACL_WEI].get();
    const auto* biasInfo = aclMemoryInfos[ACLArgs::ACL_BIAS].get();
    const auto* dstInfo  = aclMemoryInfos[ACLArgs::ACL_DST].get();
    return arm_compute::NEGEMMLowpMatrixMultiplyCore::validate(srcInfo, weiInfo, biasInfo, dstInfo, gemmInfo);
}

// Builds and configures the ACL low-precision GEMM function over the prepared tensors.
// Note: configure() is called without an explicit GEMMInfo, so ACL's default is used here
// (gemmInfo is only consulted in validateTensorsInfo).
ACLFunction ACLGEMMExecutor::configureFunction(const ACLMemoryTensors & aclMemoryTensors) {
    auto gemmFunction = std::make_unique<arm_compute::NEGEMMLowpMatrixMultiplyCore>();
    gemmFunction->configure(aclMemoryTensors.at(ACLArgs::ACL_SRC_0).get(),
                            aclMemoryTensors.at(ACLArgs::ACL_WEI).get(),
                            // TODO: fix me — bias is deliberately not wired up yet
                            nullptr, // aclMemoryTensors.at(ACLArgs::ACL_BIAS).get(),
                            aclMemoryTensors.at(ACLArgs::ACL_DST).get());
    return gemmFunction;
}

// Creates the ACL TensorInfo, first remapping plain 8-bit integer data types onto
// their asymmetric-quantized ACL counterparts (S8 -> QASYMM8_SIGNED, U8 -> QASYMM8),
// which is what the low-precision GEMM kernels expect. All other types pass through.
ACLInfo ACLGEMMExecutor::initTensorInfo(const arm_compute::TensorShape& tensorShape,
                                        const arm_compute::DataType& dataType,
                                        const arm_compute::DataLayout& dataLayout) {
    auto mappedType = dataType;
    if (dataType == arm_compute::DataType::S8) {
        mappedType = arm_compute::DataType::QASYMM8_SIGNED;
    } else if (dataType == arm_compute::DataType::U8) {
        mappedType = arm_compute::DataType::QASYMM8;
    }
    return ACLCommonExecutor::initTensorInfo(tensorShape, mappedType, dataLayout);
}

} // namespace intel_cpu
} // namespace ov
46 changes: 46 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "acl_common_executor.hpp"
#include "nodes/executors/gemm_config.hpp"

namespace ov {
namespace intel_cpu {

// Executor that runs int8 matrix multiplication through Arm Compute Library's
// NEGEMMLowpMatrixMultiplyCore.
class ACLGEMMExecutor : public ACLCommonExecutor {
public:
    ACLGEMMExecutor(const GEMMAttrs& attrs,
                    const PostOps& postOps,
                    const MemoryArgs& memory,
                    const ExecutorContext::CPtr context);

    // Static capability check used by executor selection before construction.
    static bool supports(const GEMMConfig& config);

    void updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) override;

    arm_compute::Status validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) override;

    ACLFunction configureFunction(const ACLMemoryTensors & aclMemoryTensors) override;

    // NOTE(review): same impl type as the ACL FC executor — confirm this does not
    // make the two indistinguishable in perf-counter reports.
    impl_desc_type implType() const override {
        return impl_desc_type::gemm_acl;
    }

protected:
    // Remaps plain 8-bit integer types onto ACL quantized types before TensorInfo creation.
    ACLInfo initTensorInfo(const arm_compute::TensorShape& tensorShape,
                           const arm_compute::DataType& dataType,
                           const arm_compute::DataLayout& dataLayout) override;

private:
    // NOTE(review): removed unused `fullyConnectedLayerInfo` and `weightsInfo` members —
    // copy-paste leftovers from ACLFullyConnectedExecutor, never referenced by acl_gemm.cpp.
    arm_compute::GEMMInfo gemmInfo;
};

using ACLGEMMExecutorPtr = std::shared_ptr<ACLGEMMExecutor>;

} // namespace intel_cpu
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ struct FCAttrs {
// @todo probably we don't want with bias flag, since this information is already
// a part of src memory descs
bool withBias = false;
bool weightsNonTransposed = false;
// TODO: why default is false???
bool weightsNonTransposed = true;
bool sparseWeights = false;
// @todo only memory descriptors should be a part of attributes
// actual memory should be passed into "execute" or "prepareMemory" calls
Expand Down
34 changes: 34 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/gemm_attrs.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <vector>

#include "cpu_memory.h"
#include "executor_config.hpp"

namespace ov {
namespace intel_cpu {

// @todo require explicit initialization of all the attributes?
// Attributes describing a GEMM operation for executor selection and construction.
struct GEMMAttrs {
    // @todo probably we don't want with bias flag, since this information is already
    // a part of src memory descs
    bool withBias = false;
    // NOTE(review): default is `true` here (mirroring the FCAttrs change in this commit),
    // while the historical default was `false` — confirm this is the intended default.
    bool weightsNonTransposed = true;
    bool sparseWeights = false;
    // @todo only memory descriptors should be a part of attributes
    // actual memory should be passed into "execute" or "prepareMemory" calls
    std::vector<float> dequantizationScales;
    // @todo should be passed as an additional memory input?
    MemoryCPtr decompressionSubtractPtr;
    MemoryCPtr decompressionMultiplyPtr;
    // Fixed: was left uninitialized, so reading it was undefined behavior.
    // 0 is presumed to mean "dynamic quantization disabled" — TODO confirm against callers.
    uint64_t dynamicQuantizationGroupSize = 0;
    ov::intel_cpu::Config::ModelType modelType = ov::intel_cpu::Config::ModelType::Unknown;
};

} // namespace intel_cpu
} // namespace ov
14 changes: 14 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/gemm_config.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "executor_config.hpp"
#include "gemm_attrs.hpp"

namespace ov {
namespace intel_cpu {
// Executor configuration specialized for GEMM — presumably bundles memory descriptors,
// GEMMAttrs and post-ops (see executor::Config in executor_config.hpp).
using GEMMConfig = ov::intel_cpu::executor::Config<GEMMAttrs>;
} // namespace intel_cpu
} // namespace ov
Loading

0 comments on commit 4d3f499

Please sign in to comment.