Performance fixes (openvinotoolkit#116)
dmitry-gorokhov committed Apr 5, 2023
1 parent e3afabc commit a4c7e14
Showing 33 changed files with 3,607 additions and 2,943 deletions.
31 changes: 31 additions & 0 deletions src/plugins/intel_cpu/src/dnnl_extension_utils.cpp
@@ -246,5 +246,36 @@ dnnl::algorithm DnnlExtensionUtils::convertToDnnlAlgorithm(Algorithm alg) {
}
}

+bool DnnlExtensionUtils::isUnarySupportedAsPostOp(Algorithm alg) {
+#if defined(OV_CPU_WITH_ACL)
+    return one_of(alg, Algorithm::EltwiseRelu,
+                  Algorithm::EltwiseTanh,
+                  Algorithm::EltwiseElu,
+                  Algorithm::EltwiseAbs,
+                  Algorithm::EltwiseSqrt,
+                  Algorithm::EltwiseSoftRelu,
+                  Algorithm::EltwiseSigmoid);
+#elif defined(OPENVINO_ARCH_X86_64)
+    return one_of(alg, Algorithm::EltwiseRelu,
+                  Algorithm::EltwiseGeluErf,
+                  Algorithm::EltwiseGeluTanh,
+                  Algorithm::EltwiseElu,
+                  Algorithm::EltwiseSigmoid,
+                  Algorithm::EltwiseClamp,
+                  Algorithm::EltwiseTanh,
+                  Algorithm::EltwiseSwish,
+                  Algorithm::EltwiseHswish,
+                  Algorithm::EltwiseMish,
+                  Algorithm::EltwiseHsigmoid,
+                  Algorithm::EltwiseRoundHalfToEven,
+                  Algorithm::EltwiseRoundHalfAwayFromZero,
+                  Algorithm::EltwiseAbs,
+                  Algorithm::EltwiseSqrt,
+                  Algorithm::EltwiseSoftRelu);
+#else
+    return false;
+#endif
+}

} // namespace intel_cpu
} // namespace ov
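For readers outside the codebase: one_of is the plugin's variadic membership helper, used heavily above. A minimal sketch of its shape, for illustration only (the plugin's actual implementation may differ):

#include <initializer_list>

// Sketch of a one_of-style helper: true if `val` compares equal to any of `items`.
template <typename T, typename... Args>
bool one_of(const T& val, const Args&... items) {
    bool match = false;
    // Expand the comparison against every candidate in the pack.
    (void)std::initializer_list<bool>{(match = match || (val == items))...};
    return match;
}

With such a helper, isUnarySupportedAsPostOp reduces the per-backend "is this unary eltwise fusable as a post-op" check to a single readable call.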
1 change: 1 addition & 0 deletions src/plugins/intel_cpu/src/dnnl_extension_utils.h
Original file line number Diff line number Diff line change
@@ -58,6 +58,7 @@ class DnnlExtensionUtils {
static dnnl_memory_desc_t clone_desc(const_dnnl_memory_desc_t cdesc);
static const char* query_pd_info(const_dnnl_primitive_desc_t pd);
static dnnl::algorithm convertToDnnlAlgorithm(Algorithm alg);
+    static bool isUnarySupportedAsPostOp(Algorithm alg);
};

} // namespace intel_cpu
29 changes: 2 additions & 27 deletions src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -988,22 +988,7 @@ void GraphOptimizer::FuseConvolutionAndSimpleOperationThroughMaxPool(Graph &graph) {
continue;
}

-        if (!one_of(fuseCandidate->getAlgorithm(), Algorithm::EltwiseRelu,
-                    Algorithm::EltwiseGeluErf,
-                    Algorithm::EltwiseGeluTanh,
-                    Algorithm::EltwiseElu,
-                    Algorithm::EltwiseSigmoid,
-                    Algorithm::EltwiseClamp,
-                    Algorithm::EltwiseTanh,
-                    Algorithm::EltwiseSwish,
-                    Algorithm::EltwiseHswish,
-                    Algorithm::EltwiseMish,
-                    Algorithm::EltwiseHsigmoid,
-                    Algorithm::EltwiseRoundHalfToEven,
-                    Algorithm::EltwiseRoundHalfAwayFromZero,
-                    Algorithm::EltwiseAbs,
-                    Algorithm::EltwiseSqrt,
-                    Algorithm::EltwiseSoftRelu)) {
+        if (!DnnlExtensionUtils::isUnarySupportedAsPostOp(fuseCandidate->getAlgorithm())) {
parent++;
continue;
}
@@ -1176,17 +1161,7 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) {

auto isFusingSupported = [&](NodePtr conv, NodePtr child) {
return child->getType() == Type::Eltwise &&
-               one_of(child->getAlgorithm(), Algorithm::EltwiseRelu,
-                      Algorithm::EltwiseElu,
-                      Algorithm::EltwiseSigmoid,
-                      Algorithm::EltwiseClamp,
-                      Algorithm::EltwiseSwish,
-                      Algorithm::EltwiseHswish,
-                      Algorithm::EltwiseMish,
-                      Algorithm::EltwiseHsigmoid,
-                      Algorithm::EltwiseRoundHalfToEven,
-                      Algorithm::EltwiseRoundHalfAwayFromZero,
-                      Algorithm::EltwiseSoftRelu);
+               DnnlExtensionUtils::isUnarySupportedAsPostOp(child->getAlgorithm());
};

for (auto &graphNode : graphNodes) {
27 changes: 10 additions & 17 deletions src/plugins/intel_cpu/src/node.cpp
@@ -960,6 +960,9 @@ void Node::cleanup() {
const std::vector<impl_desc_type>& Node::getPrimitivesPriority() {
std::vector<impl_desc_type> priorities = {
impl_desc_type::unknown,
+        // Undef impl type is used to express use cases where the real type is unknown during compilation
+        // Undef has a higher priority than defined types in order to force the primitive selection logic to make a decision based on other properties
+        impl_desc_type::undef,
impl_desc_type::brgconv_avx512_amx_1x1,
impl_desc_type::brgconv_avx512_amx,
impl_desc_type::jit_avx512_amx_dw,
@@ -989,6 +992,7 @@ const std::vector<impl_desc_type>& Node::getPrimitivesPriority() {
impl_desc_type::gemm_avx2,
impl_desc_type::gemm_avx,
impl_desc_type::gemm_sse42,
+        impl_desc_type::acl,
impl_desc_type::jit_gemm,
impl_desc_type::ref_any,
impl_desc_type::ref,
@@ -1341,6 +1345,7 @@ Node* Node::NodesFactory::create(const std::shared_ptr<ngraph::Node>& op, const
}

bool Node::canBePerformedAsScaleShift(const Node *parentNode) const {
+#if defined(OPENVINO_ARCH_X86_64)
IE_ASSERT(parentNode);

size_t fusingPort = 0;
@@ -1391,6 +1396,10 @@ bool Node::canBePerformedAsScaleShift(const Node *parentNode) const {
Algorithm::EltwisePrelu,
Algorithm::EltwiseMulAdd) && isBroadcastableToDataInput())
|| isConvertablePowerStatic();
+#else
+    // TODO: provide correct list of operations for other backends
+    return false;
+#endif
}

// @todo shifts for Subtract and scales for Divide are replaced with
@@ -1607,23 +1616,7 @@ bool Node::canFuseSimpleOperation(const NodePtr& node) const {
}
return ret;
} else if (node->getType() == Type::Eltwise) {
-        return one_of(node->getAlgorithm(),
-                      Algorithm::EltwiseRelu,
-                      Algorithm::EltwiseGeluErf,
-                      Algorithm::EltwiseGeluTanh,
-                      Algorithm::EltwiseElu,
-                      Algorithm::EltwiseSigmoid,
-                      Algorithm::EltwiseClamp,
-                      Algorithm::EltwiseTanh,
-                      Algorithm::EltwiseSwish,
-                      Algorithm::EltwiseHswish,
-                      Algorithm::EltwiseMish,
-                      Algorithm::EltwiseHsigmoid,
-                      Algorithm::EltwiseRoundHalfToEven,
-                      Algorithm::EltwiseRoundHalfAwayFromZero,
-                      Algorithm::EltwiseAbs,
-                      Algorithm::EltwiseSqrt,
-                      Algorithm::EltwiseSoftRelu) ||
+        return DnnlExtensionUtils::isUnarySupportedAsPostOp(node->getAlgorithm()) ||
node->canBePerformedAsScaleShift(this);
}
return false;
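The priorities vector above drives primitive-descriptor selection: the descriptor whose impl type appears earliest in the list wins, which is why undef now sits near the top. A hypothetical illustration of that ranking idea (DescInfo, selectBest, and the int stand-in for impl_desc_type are inventions for this sketch, not the plugin's code):

#include <algorithm>
#include <cstddef>
#include <vector>

// Hypothetical descriptor record; the real plugin stores far richer state.
struct DescInfo {
    int implType;  // stand-in for impl_desc_type
};

// Pick the descriptor whose impl type ranks earliest in `priorities`;
// types absent from the list rank last.
std::size_t selectBest(const std::vector<DescInfo>& descs,
                       const std::vector<int>& priorities) {
    std::size_t bestIdx = 0;
    std::size_t bestRank = priorities.size();
    for (std::size_t i = 0; i < descs.size(); ++i) {
        auto it = std::find(priorities.begin(), priorities.end(), descs[i].implType);
        auto rank = static_cast<std::size_t>(it - priorities.begin());
        if (rank < bestRank) {
            bestRank = rank;
            bestIdx = i;
        }
    }
    return bestIdx;
}

Under such a scheme, ranking undef above the defined types keeps descriptors with a not-yet-known impl type competitive, so the choice falls back to other properties, as the new comment explains.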
1 change: 1 addition & 0 deletions src/plugins/intel_cpu/src/nodes/conv.cpp
@@ -327,6 +327,7 @@ InferenceEngine::Precision Convolution::fusedEltwisePrecision(const NodePtr& fusingNode) {
const std::vector<impl_desc_type>& Convolution::getPrimitivesPriority() {
std::vector<impl_desc_type> priorities = {
impl_desc_type::unknown,
+        impl_desc_type::dw_acl,
impl_desc_type::winograd_acl,
impl_desc_type::gemm_acl,
impl_desc_type::brgconv_avx512_amx_1x1,
65 changes: 40 additions & 25 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
@@ -10,19 +10,7 @@ namespace intel_cpu {

using namespace arm_compute;

-TensorShape eltwiseShapeCast(const VectorDims &dims) {
-    arm_compute::TensorShape tensorShape;
-    for (std::size_t i = 0; i < dims.size(); ++i) {
-        tensorShape.set(dims.size() - i - 1, dims[i], false);
-    }
-    if (tensorShape.num_dimensions() == 0) {
-        tensorShape.set(0, 1, false);
-        tensorShape.set_num_dimensions(1);
-    }
-    return tensorShape;
-}
-
-VectorDims reshape_sizes(VectorDims dims) {
+inline VectorDims reshape_sizes(VectorDims dims) {
const size_t MAX_NUM_SHAPE = arm_compute::MAX_DIMS;
VectorDims result_dims(MAX_NUM_SHAPE - 1);
if (dims.size() >= MAX_NUM_SHAPE) {
@@ -46,29 +34,56 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vector<MemoryDescPtr> &srcDescs,
if (!postOps.empty()) { return false; }
aclEltwiseAttrs = eltwiseAttrs;

+    std::vector<arm_compute::TensorShape> srcVecDims(srcDescs.size()), dstVecDims(dstDescs.size());
+    std::vector<arm_compute::DataLayout> srcDataLayout(srcDescs.size()), dstDataLayout(dstDescs.size());
+    std::vector<arm_compute::TensorInfo> srcTensorsInfo(srcDescs.size()), dstTensorsInfo(dstDescs.size());
+    srcTensors = std::vector<arm_compute::Tensor>(srcDescs.size());
+    dstTensors = std::vector<arm_compute::Tensor>(dstDescs.size());
+
+    for (int i = 0; i < srcVecDims.size(); i++) {
+        srcVecDims[i] = shapeCast(reshape_sizes(srcDescs[i]->getShape().getDims()));
+    }
+    for (int i = 0; i < dstVecDims.size(); i++) {
+        dstVecDims[i] = shapeCast(reshape_sizes(dstDescs[i]->getShape().getDims()));
+    }
+
+    for (int i = 0; i < srcDescs.size(); i++) {
+        srcDataLayout[i] = getAclDataLayoutByMemoryDesc(srcDescs[i]);
+    }
+    for (int i = 0; i < dstDescs.size(); i++) {
+        dstDataLayout[i] = getAclDataLayoutByMemoryDesc(dstDescs[i]);
+    }
+
if (srcDescs.size() == 2 &&
srcDescs[0]->hasLayoutType(LayoutType::nspc) && srcDescs[1]->hasLayoutType(LayoutType::nspc) &&
srcDescs[0]->getShape().getDims() != srcDescs[1]->getShape().getDims()) {
-        return false;
+        auto dim_size = srcDescs[0]->getShape().getDims().size();
+        auto mover = [&dim_size](TensorShape &_shape) {
+            if (dim_size == 5) { std::swap(_shape[2], _shape[3]); }
+            std::swap(_shape[1], _shape[2]);
+            std::swap(_shape[0], _shape[1]);
+        };
+        if (dim_size < 5) {
+            srcDataLayout[0] = srcDataLayout[1] = dstDataLayout[0] = DataLayout::NCHW;
+        } else {
+            srcDataLayout[0] = srcDataLayout[1] = dstDataLayout[0] = DataLayout::NCDHW;
+        }
+        mover(srcVecDims[0]);
+        mover(srcVecDims[1]);
+        mover(dstVecDims[0]);
}

-    std::vector<VectorDims> srcVecDims(srcDescs.size()), dstVecDims(dstDescs.size());
-    std::vector<TensorInfo> srcTensorsInfo(srcDescs.size()), dstTensorsInfo(dstDescs.size());
-    srcTensors = std::vector<arm_compute::Tensor>(srcDescs.size());
-    dstTensors = std::vector<arm_compute::Tensor>(dstDescs.size());
-
for (int i = 0; i < srcVecDims.size(); i++) {
-        srcVecDims[i] = reshape_sizes(srcDescs[i]->getShape().getDims());
-        srcTensorsInfo[i] = TensorInfo(eltwiseShapeCast(srcVecDims[i]), 1,
+        srcTensorsInfo[i] = TensorInfo(srcVecDims[i], 1,
                                        precisionToAclDataType(srcDescs[i]->getPrecision()),
-                                       getAclDataLayoutByMemoryDesc(srcDescs[i]));
+                                       srcDataLayout[i]);
srcTensors[i].allocator()->init(srcTensorsInfo[i]);
}

for (int i = 0; i < dstVecDims.size(); i++) {
-        dstVecDims[i] = reshape_sizes(dstDescs[i]->getShape().getDims());
-        dstTensorsInfo[i] = TensorInfo(eltwiseShapeCast(dstVecDims[i]), 1,
+        dstTensorsInfo[i] = TensorInfo(dstVecDims[i], 1,
                                        precisionToAclDataType(dstDescs[i]->getPrecision()),
-                                       getAclDataLayoutByMemoryDesc(dstDescs[i]));
+                                       dstDataLayout[i]);
dstTensors[i].allocator()->init(dstTensorsInfo[i]);
}

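The mover lambda in the hunk above re-permutes an ACL TensorShape, which stores dimensions innermost-first, so that a channels-last (nspc) buffer can be described to ACL under an NCHW/NCDHW layout tag. A toy walk-through of the 4D case, under one reading of the swap order (the char array merely labels dimension slots):

#include <algorithm>
#include <array>
#include <cstdio>

int main() {
    // Canonical NCHW dims cast to ACL's innermost-first order: slot 0 holds W.
    std::array<char, 4> shape = {'W', 'H', 'C', 'N'};
    // The two swaps from the mover lambda (4D case):
    std::swap(shape[1], shape[2]);  // -> W C H N
    std::swap(shape[0], shape[1]);  // -> C W H N, i.e. logical N,H,W,C
    // Read back as NCHW by ACL, the shape now matches the NHWC memory order.
    for (char d : shape) std::printf("%c ", d);  // prints: C W H N
    return 0;
}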
2 changes: 0 additions & 2 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.hpp
@@ -10,8 +10,6 @@
namespace ov {
namespace intel_cpu {

-arm_compute::TensorShape eltwiseShapeCast(const VectorDims& dims);
-
class AclEltwiseExecutor : public EltwiseExecutor {
public:
AclEltwiseExecutor(const ExecutorContext::CPtr context);
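For context on the removed declaration: OpenVINO keeps dims outermost-first while ACL's TensorShape is innermost-first, so eltwiseShapeCast reversed the order (the shared shapeCast now used in acl_eltwise.cpp presumably follows the same convention). A standalone sketch of that reversal, mirroring the deleted implementation:

#include <cstddef>
#include <vector>

using VectorDims = std::vector<std::size_t>;

// Reverse outermost-first dims into an innermost-first list, the way the
// removed eltwiseShapeCast populated arm_compute::TensorShape.
VectorDims toInnermostFirst(const VectorDims& dims) {
    VectorDims out(dims.size());
    for (std::size_t i = 0; i < dims.size(); ++i) {
        out[dims.size() - i - 1] = dims[i];
    }
    if (out.empty()) {
        out.push_back(1);  // scalars become a one-element shape, as in the original
    }
    return out;
}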