[CPU] [ARM] [INT8] FullyConnected (openvinotoolkit#25171)
### Details:
 - *[ARM] [INT8] FullyConnected*

### Tickets:
 - *CVS-149494*

---------

Co-authored-by: Aleksandr Voron <[email protected]>
eshoguli and alvoron authored Dec 18, 2024
1 parent 87370fe commit 9ff5942
Showing 90 changed files with 1,181 additions and 478 deletions.
@@ -1897,4 +1897,4 @@ bool NetworkHelper::checkConstantNotInf(const std::shared_ptr<Node> constant_node
 }
 } // namespace low_precision
 } // namespace pass
-} // namespace ov
\ No newline at end of file
+} // namespace ov
@@ -24,7 +24,7 @@ ov::pass::ConvertFCToFCQuantizedLegacy::ConvertFCToFCQuantizedLegacy() {
     std::vector<element::Type> weights_types{ov::element::i8};
 
     auto activations_m = pattern::any_input(ov::pass::pattern::type_matches_any(activation_types));
-    auto weights_m = wrap_type<ov::op::v0::Constant>(ov::pass::pattern::type_matches_any(weights_types));
+    auto weights_m = pattern::any_input();
     auto bias_m = pattern::any_input();
 
     auto fully_connected_m = wrap_type<ov::op::internal::FullyConnected>({activations_m, weights_m, bias_m});
@@ -43,7 +43,8 @@ ov::pass::ConvertFCToFCQuantizedLegacy::ConvertFCToFCQuantizedLegacy() {
         const auto& fc_output_shape = fc_output.get_partial_shape();
         const auto& multiply_output_shape = multiply.get_partial_shape();
 
-        if (*fc_output_shape.rbegin() != *multiply_output_shape.rbegin()) {
+        if (*fc_output_shape.rbegin() != *multiply_output_shape.rbegin() ||
+            !ov::op::util::is_on_constant_path(weights)) {
            return false;
        }
 
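Note on the two hunks above: once the weights input can be an arbitrary subgraph (a quantized weights branch is typically a Constant wrapped in Convert/Multiply decompression rather than a bare i8 Constant), the callback has to re-establish that the weights are still compile-time data. A minimal sketch of that guard, assuming the callback's pattern-map lookup (simplified, not the verbatim source):

    // Sketch: inside the matcher callback, after the pattern has matched.
    const auto& pattern_map = m.get_pattern_value_map();
    const auto weights = pattern_map.at(weights_m);

    // The legacy quantized FC needs weights that fold to a constant,
    // however they are wrapped; bail out for runtime-computed weights.
    if (!ov::op::util::is_on_constant_path(weights)) {
        return false;
    }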
47 changes: 1 addition & 46 deletions src/plugins/intel_cpu/src/dnnl_postops_composer.cpp
@@ -13,6 +13,7 @@
 
 #include "cpu_types.h"
 #include "memory_desc/dnnl_blocked_memory_desc.h"
+#include "nodes/executors/common/common_utils.hpp"
 #include "nodes/executors/memory_arguments.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "utils/cpu_utils.hpp"
@@ -21,52 +22,6 @@
 namespace ov {
 namespace intel_cpu {
 
-static std::vector<float> getDeQuantizedScales(const MemoryArgs& memory) {
-    if (!memory.count(ARG_DST_DEQ_SCALE))
-        return {};
-
-    auto scalesMemory = memory.at(ARG_DST_DEQ_SCALE);
-
-    auto scalesData = static_cast<const float*>(scalesMemory->getData());
-
-    if (!scalesData)
-        return {};
-
-    auto dstShape = memory.at(ARG_DST)->getShape();
-    auto dqScalesShape = scalesMemory->getShape();
-
-    auto scalesDims = getNormalizedDimsBySize(dqScalesShape.getDims(), dstShape.getDims().size());
-
-    auto scaleSize = std::accumulate(scalesDims.begin(), scalesDims.end(), std::size_t(1), std::multiplies<size_t>());
-
-    std::vector<float> DQScales(scaleSize, 1.0);
-
-    OPENVINO_ASSERT(scaleSize == 1 || DQScales.size() == 1 || DQScales.size() == scaleSize,
-                    "set invalid scales size , DQScales vector size: ",
-                    DQScales.size(),
-                    ", scale data size: ",
-                    scaleSize);
-
-    // @todo do we really need to broadcast dq scales and then resize them back?
-    if (scaleSize > DQScales.size())
-        DQScales.resize(scaleSize, DQScales[0]);
-    if (1 == scaleSize) {
-        std::transform(DQScales.begin(), DQScales.end(), DQScales.begin(), [=](float val) {
-            return (scalesData[0] * val);
-        });
-    } else {
-        for (size_t i = 0; i < DQScales.size(); i++) {
-            DQScales[i] *= scalesData[i];
-        }
-    }
-    if (std::all_of(DQScales.begin(), DQScales.end(), [&](float val) {
-            return (val == DQScales[0]);
-        }))
-        DQScales.resize(1);
-
-    return DQScales;
-}
-
 DnnlPostOpsComposer::DnnlPostOpsComposer(const PostOps& postOps,
                                          const dnnl::engine& engine,
                                          const VectorDims& outputDims,
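The 46 deleted lines are the whole getDeQuantizedScales helper. Judging by the nodes/executors/common/common_utils.hpp include added above, the helper moved to a shared header (presumably so the ACL executors can reuse it) rather than being removed outright. Its core computation, restated as a self-contained sketch; the name foldDQScales and the already-normalized inputs are assumptions for brevity:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Multiply accumulated dequantization scales by the per-channel data,
    // then collapse to one scalar when all channels agree (cheaper to apply).
    static std::vector<float> foldDQScales(const float* scalesData, std::size_t scaleSize) {
        std::vector<float> dqScales(scaleSize, 1.0f);
        for (std::size_t i = 0; i < dqScales.size(); ++i) {
            dqScales[i] *= scalesData[i];
        }
        if (std::all_of(dqScales.begin(), dqScales.end(), [&](float v) {
                return v == dqScales[0];
            })) {
            dqScales.resize(1);
        }
        return dqScales;
    }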
@@ -11,14 +11,13 @@
 namespace ov {
 namespace intel_cpu {
 
-static const std::unordered_map<int, ACLArgs> argConvert = {
-    {ARG_SRC_0, ACL_SRC_0},
-    {ARG_SRC_1, ACL_SRC_1},
-    {ARG_SRC_2, ACL_SRC_2},
-    {ARG_BIAS, ACL_BIAS},
-    {ARG_WEI, ACL_WEI},
-    {ARG_DST, ACL_DST},
-};
+static const std::unordered_map<int, ACLArgs> argConvert = {{ARG_SRC_0, ACL_SRC_0},
+                                                            {ARG_SRC_1, ACL_SRC_1},
+                                                            {ARG_SRC_2, ACL_SRC_2},
+                                                            {ARG_BIAS, ACL_BIAS},
+                                                            {ARG_WEI, ACL_WEI},
+                                                            {ARG_DST, ACL_DST},
+                                                            {ARG_DST_DEQ_SCALE, ACL_DST_DEQ_SCALE}};
 
 using ACLTypes = std::array<arm_compute::DataType, ACLArgs::COUNT_OF_ARGS>;
 using ACLLayouts = std::array<arm_compute::DataLayout, ACLArgs::COUNT_OF_ARGS>;
@@ -39,9 +38,9 @@ static void initACLTensorParams(const MemoryPtr& memoryPtr,
     }
 }
 
-static std::shared_ptr<arm_compute::TensorInfo> initTensorInfo(const arm_compute::TensorShape& tensorShape,
-                                                               const arm_compute::DataType& dataType,
-                                                               const arm_compute::DataLayout& dataLayout) {
+std::shared_ptr<arm_compute::TensorInfo> ACLCommonExecutor::initTensorInfo(const arm_compute::TensorShape& tensorShape,
+                                                                           const arm_compute::DataType& dataType,
+                                                                           const arm_compute::DataLayout& dataLayout) {
     std::shared_ptr<arm_compute::TensorInfo> aclMemoryInfo = nullptr;
     if (dataType != arm_compute::DataType::UNKNOWN) {
         aclMemoryInfo = std::make_shared<arm_compute::TensorInfo>(tensorShape, 1, dataType, dataLayout);
@@ -70,6 +69,9 @@ bool ACLCommonExecutor::update(const MemoryArgs& memory) {
     ACLTypes aclDataType{};
     ACLLayouts aclDataLayout{};
     for (auto& cpu_mem_ptr : memory) {
+        if (cpu_mem_ptr.second->getSize() == 0) {
+            continue;
+        }
         const ACLArgs index = argConvert.at(cpu_mem_ptr.first);
         initACLTensorParams(cpu_mem_ptr.second,
                             aclTensorAttrs,
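The new early continue is what makes the extended argConvert map safe: an optional argument such as ARG_DST_DEQ_SCALE can be present in the memory map with zero size when the node carries no dequantization scales, and building an ACL descriptor for it would be wrong. That reading is an inference from this excerpt; a condensed view of the loop:

    // Condensed from the hunk above (not verbatim source).
    for (auto& cpu_mem_ptr : memory) {
        if (cpu_mem_ptr.second->getSize() == 0) {
            continue;  // optional arg (e.g. dq scales) not supplied
        }
        const ACLArgs index = argConvert.at(cpu_mem_ptr.first);
        // ... fill ACL shape/type/layout for this argument ...
    }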
@@ -11,7 +11,7 @@
 namespace ov {
 namespace intel_cpu {
 
-enum ACLArgs { ACL_SRC_0, ACL_SRC_1, ACL_SRC_2, ACL_BIAS, ACL_WEI, ACL_DST, COUNT_OF_ARGS };
+enum ACLArgs { ACL_SRC_0, ACL_SRC_1, ACL_SRC_2, ACL_BIAS, ACL_WEI, ACL_DST, ACL_DST_DEQ_SCALE, COUNT_OF_ARGS };
 
 using ACLFunction = std::unique_ptr<arm_compute::IFunction>;
 using ACLShapes = std::array<arm_compute::TensorShape, ACLArgs::COUNT_OF_ARGS>;
Expand Down Expand Up @@ -42,6 +42,9 @@ class ACLCommonExecutor : public Executor {

protected:
ACLTensorAttrs aclTensorAttrs;
virtual std::shared_ptr<arm_compute::TensorInfo> initTensorInfo(const arm_compute::TensorShape& tensorShape,
const arm_compute::DataType& dataType,
const arm_compute::DataLayout& dataLayout);

private:
ACLTensors aclMemoryTensors;
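Promoting initTensorInfo from a file-local static to a protected virtual is the hook that lets a derived executor shape the ACL tensor descriptors. A plausible use, consistent with the INT8 theme of this PR but not shown in this excerpt (the class name and the scale/zero-point values below are hypothetical):

    // Hypothetical override; only the virtual hook above is real.
    std::shared_ptr<arm_compute::TensorInfo> ACLLowpFullyConnectedExecutor::initTensorInfo(
        const arm_compute::TensorShape& tensorShape,
        const arm_compute::DataType& dataType,
        const arm_compute::DataLayout& dataLayout) {
        auto info = ACLCommonExecutor::initTensorInfo(tensorShape, dataType, dataLayout);
        // Quantized ACL kernels read scale/zero-point from the descriptor.
        if (info && dataType == arm_compute::DataType::QASYMM8_SIGNED) {
            info->set_quantization_info(arm_compute::QuantizationInfo(0.1f, 0));  // placeholder values
        }
        return info;
    }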
(Diff truncated: 85 of the 90 changed files are not shown above.)