Added Reduce executors (openvinotoolkit#98)
alvoron authored Mar 3, 2023
1 parent 2deed7a commit b1daeeb
Showing 19 changed files with 849 additions and 33 deletions.
20 changes: 1 addition & 19 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_matmul.cpp
@@ -2,32 +2,14 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
+#include "acl_utils.hpp"
 #include "acl_matmul.hpp"
 
 namespace ov {
 namespace intel_cpu {
 
 using namespace arm_compute;
 
-TensorShape shapeCast(const VectorDims& dims) {
-    arm_compute::TensorShape tensorShape;
-    for (std::size_t i = 0; i < dims.size(); ++i) {
-        tensorShape.set(dims.size() - i - 1, dims[i], false);
-    }
-    if (tensorShape.num_dimensions() == 0) {
-        tensorShape.set(0, 1, false);
-        tensorShape.set_num_dimensions(1);
-    }
-    return tensorShape;
-}
-
-inline Dim vectorProduct(const VectorDims& vec, size_t size) {
-    Dim prod = 1;
-    for (size_t i = 0; i < size; ++i)
-        prod *= vec[i];
-    return prod;
-}
-
 AclMatMulExecutor::AclMatMulExecutor(const ExecutorContext::CPtr context) : MatMulExecutor(context) {}
 
 bool AclMatMulExecutor::init(const MatMulAttrs& matmulAttrs,
2 changes: 0 additions & 2 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_matmul.hpp
@@ -10,8 +10,6 @@
 namespace ov {
 namespace intel_cpu {
 
-arm_compute::TensorShape shapeCast(const VectorDims& dims);
-
 class AclMatMulExecutor : public MatMulExecutor {
 public:
     AclMatMulExecutor(const ExecutorContext::CPtr context);
101 changes: 101 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.cpp
@@ -0,0 +1,101 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_utils.hpp"
#include "acl_reduce.hpp"

namespace ov {
namespace intel_cpu {

using namespace arm_compute;

arm_compute::ReductionOperation getAclReductionOperationByAlgorithm(Algorithm algorithm) {
    switch (algorithm) {
        case Algorithm::ReduceMax:  return arm_compute::ReductionOperation::MAX;
        case Algorithm::ReduceMin:  return arm_compute::ReductionOperation::MIN;
        case Algorithm::ReduceSum:  return arm_compute::ReductionOperation::SUM;
        case Algorithm::ReduceProd: return arm_compute::ReductionOperation::PROD;
        default: IE_THROW() << "Unsupported reduction operation: " << static_cast<int>(algorithm);
    }
}

AclReduceExecutor::AclReduceExecutor(const ExecutorContext::CPtr context) : ReduceExecutor(context) {}

bool AclReduceExecutor::init(const ReduceAttrs& reduceAttrs,
                             const std::vector<MemoryDescPtr>& srcDescs,
                             const std::vector<MemoryDescPtr>& dstDescs,
                             const dnnl::primitive_attr& attr) {
    if (reduceAttrs.operation != Algorithm::ReduceMax &&
        reduceAttrs.operation != Algorithm::ReduceMin &&
        reduceAttrs.operation != Algorithm::ReduceSum &&
        reduceAttrs.operation != Algorithm::ReduceProd &&
        reduceAttrs.operation != Algorithm::ReduceMean) {
        return false;
    }

    this->reduceAttrs = reduceAttrs;

    auto srcDims = srcDescs[0]->getShape().getStaticDims();
    auto dstDims = dstDescs[0]->getShape().getStaticDims();

    TensorInfo srcTensorInfo = TensorInfo(shapeCast(srcDims), 1,
        precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0]));
    TensorInfo dstTensorInfo = TensorInfo(shapeCast(dstDims), 1,
        precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0]));

    srcTensor.allocator()->init(srcTensorInfo);
    dstTensor.allocator()->init(dstTensorInfo);

    switch (reduceAttrs.operation) {
        case Algorithm::ReduceMean:
            // NEReduceMean accepts a list of reduction axes.
            for (size_t i = 0; i < reduceAttrs.axes.size(); ++i) {
                auto pos = axisCast(i, reduceAttrs.axes.size());
                axesMean.set(pos, reduceAttrs.axes[i]);
            }
            if (!arm_compute::NEReduceMean::validate(&srcTensorInfo, axesMean, reduceAttrs.keepDims, &dstTensorInfo)) {
                return false;
            }
            exec_func = [this] {
                auto acl_op = std::make_unique<arm_compute::NEReduceMean>();
                acl_op->configure(&srcTensor, axesMean, this->reduceAttrs.keepDims, &dstTensor);
                acl_op->run();
            };
            break;
        case Algorithm::ReduceMax:
        case Algorithm::ReduceMin:
        case Algorithm::ReduceSum:
        case Algorithm::ReduceProd:
            // NEReductionOperation supports a single reduction axis only.
            if (reduceAttrs.axes.size() != 1) {
                return false;
            }
            if (!arm_compute::NEReductionOperation::validate(&srcTensorInfo, &dstTensorInfo, axisCast(reduceAttrs.axes[0], srcDims.size()),
                                                             getAclReductionOperationByAlgorithm(reduceAttrs.operation), reduceAttrs.keepDims)) {
                return false;
            }
            exec_func = [this, srcDims] {
                auto acl_op = std::make_unique<arm_compute::NEReductionOperation>();
                acl_op->configure(&srcTensor, &dstTensor, axisCast(this->reduceAttrs.axes[0], srcDims.size()),
                                  getAclReductionOperationByAlgorithm(this->reduceAttrs.operation), this->reduceAttrs.keepDims);
                acl_op->run();
            };
            break;
        default:
            IE_THROW() << "Unsupported operation type for ACL Reduce executor: " << static_cast<int>(reduceAttrs.operation);
    }

    return true;
}

void AclReduceExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, std::unordered_map<int, MemoryPtr> postOpsArgs) {
    // import_memory() wraps the caller's buffers without taking ownership;
    // free() below releases only the import, not the underlying memory.
    srcTensor.allocator()->import_memory(src[0]->GetPtr());
    dstTensor.allocator()->import_memory(dst[0]->GetPtr());

    exec_func();

    srcTensor.allocator()->free();
    dstTensor.allocator()->free();
}

} // namespace intel_cpu
} // namespace ov
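For orientation, here is a minimal caller-side sketch of the new executor. The names `context`, `srcDesc`/`dstDesc`, and `srcMem`/`dstMem` are placeholders, not part of the commit; in the plugin this wiring is done by the Reduce node through the executor factory rather than by hand.

```cpp
// Hypothetical usage sketch: build a ReduceSum executor over one axis and
// run it on preallocated buffers.
ReduceAttrs attrs;
attrs.operation = Algorithm::ReduceSum;
attrs.axes     = {3};    // one axis, as NEReductionOperation requires
attrs.keepDims = true;

AclReduceExecutor reduce(context);  // 'context' supplied by the plugin
if (reduce.init(attrs, {srcDesc}, {dstDesc}, dnnl::primitive_attr{})) {
    // exec() imports the raw pointers into ACL tensors, runs the lambda
    // configured in init(), and releases the non-owning imports afterwards.
    reduce.exec({srcMem}, {dstMem}, {});
}
```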
60 changes: 60 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.hpp
@@ -0,0 +1,60 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

// TODO: remove relative path
#include "../reduce.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"

namespace ov {
namespace intel_cpu {

class AclReduceExecutor : public ReduceExecutor {
public:
    AclReduceExecutor(const ExecutorContext::CPtr context);

    bool init(const ReduceAttrs& reduceAttrs,
              const std::vector<MemoryDescPtr>& srcDescs,
              const std::vector<MemoryDescPtr>& dstDescs,
              const dnnl::primitive_attr& attr) override;
    void exec(const std::vector<MemoryCPtr>& src,
              const std::vector<MemoryPtr>& dst,
              std::unordered_map<int, MemoryPtr> postOpsArgs) override;

    impl_desc_type getImplType() const override {
        return implType;
    }

private:
    std::function<void()> exec_func;
    ReduceAttrs reduceAttrs;
    impl_desc_type implType = impl_desc_type::acl;

    arm_compute::Coordinates axesMean;
    arm_compute::Tensor srcTensor;
    arm_compute::Tensor dstTensor;
};

class AclReduceExecutorBuilder : public ReduceExecutorBuilder {
public:
    bool isSupported(const ReduceAttrs& reduceAttrs,
                     const std::vector<MemoryDescPtr>& srcDescs,
                     const std::vector<MemoryDescPtr>& dstDescs) const override {
        // Source and destination precisions must match and be one of the
        // precisions ACL reductions handle: FP32, FP16, or I32.
        if (srcDescs[0]->getPrecision() != dstDescs[0]->getPrecision() ||
            (srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP32 &&
             srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP16 &&
             srcDescs[0]->getPrecision() != InferenceEngine::Precision::I32))
            return false;

        return true;
    }

    ReduceExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override {
        return std::make_shared<AclReduceExecutor>(context);
    }
};

} // namespace intel_cpu
} // namespace ov
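A builder like this is typically consulted when the plugin picks an executor for a node. The selection loop below is an illustrative sketch of that pattern, not the factory code from this commit:

```cpp
// Illustrative executor selection: probe each registered builder and return
// the first executor whose isSupported() and init() both succeed.
std::vector<std::shared_ptr<ReduceExecutorBuilder>> builders = {
    std::make_shared<AclReduceExecutorBuilder>()
};
for (const auto& builder : builders) {
    if (!builder->isSupported(reduceAttrs, srcDescs, dstDescs))
        continue;
    ReduceExecutorPtr executor = builder->makeExecutor(context);
    if (executor->init(reduceAttrs, srcDescs, dstDescs, attr))
        return executor;    // otherwise fall through to the next builder
}
```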
36 changes: 34 additions & 2 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp
@@ -3,9 +3,41 @@
 //
 #pragma once
 
+#include "ie_precision.hpp"
+#include "memory_desc/cpu_memory_desc.h"
+#include "arm_compute/core/Types.h"
+
 namespace ov {
 namespace intel_cpu {
 
+/**
+ * @brief Return ComputeLibrary TensorShape with the reversed (innermost-first) dimension order used in ACL
+ * @param dims vector of dimensions to convert
+ * @return ComputeLibrary TensorShape object
+ */
+inline arm_compute::TensorShape shapeCast(const VectorDims& dims) {
+    arm_compute::TensorShape tensorShape;
+    for (std::size_t i = 0; i < dims.size(); ++i) {
+        tensorShape.set(dims.size() - i - 1, dims[i], false);
+    }
+    if (tensorShape.num_dimensions() == 0) {
+        tensorShape.set(0, 1, false);
+        tensorShape.set_num_dimensions(1);
+    }
+    return tensorShape;
+}
+
+inline std::size_t axisCast(const std::size_t axis, const std::size_t shapeSize) {
+    return shapeSize - axis - 1;
+}
+
+inline Dim vectorProduct(const VectorDims& vec, size_t size) {
+    Dim prod = 1;
+    for (size_t i = 0; i < size; ++i)
+        prod *= vec[i];
+    return prod;
+}
+
 /**
  * @brief Return ComputeLibrary DataType that corresponds to the given precision
  * @param precision precision to be converted
@@ -36,8 +68,8 @@ inline arm_compute::DataType precisionToAclDataType(InferenceEngine::Precision p
 inline arm_compute::DataLayout getAclDataLayoutByMemoryDesc(MemoryDescCPtr desc) {
     if (desc->hasLayoutType(LayoutType::ncsp)) {
         if (desc->getShape().getRank() == 4) return arm_compute::DataLayout::NCHW;
-    if (desc->getShape().getRank() == 5) return arm_compute::DataLayout::NCDHW;
-    } else if(desc->hasLayoutType(LayoutType::nspc)) {
+        if (desc->getShape().getRank() == 5) return arm_compute::DataLayout::NCDHW;
+    } else if (desc->hasLayoutType(LayoutType::nspc)) {
         if (desc->getShape().getRank() == 4) return arm_compute::DataLayout::NHWC;
         if (desc->getShape().getRank() == 5) return arm_compute::DataLayout::NDHWC;
     }
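To make the dimension-order conversion concrete, here is a standalone illustration of the three helpers above, with the values worked out by hand:

```cpp
// OpenVINO dims are outermost-first (N, C, H, W); ACL indexes dimensions
// innermost-first, so shapeCast() mirrors the order.
VectorDims dims = {2, 3, 4, 5};                       // N=2, C=3, H=4, W=5
arm_compute::TensorShape aclShape = shapeCast(dims);
// aclShape[0] == 5 (W), aclShape[1] == 4 (H),
// aclShape[2] == 3 (C), aclShape[3] == 2 (N)

// Axis indices mirror the same way: logical axis 1 (C) of a 4D shape
// becomes ACL axis 4 - 1 - 1 == 2.
std::size_t aclAxis = axisCast(1, dims.size());

// vectorProduct() multiplies the first `size` dims: 2 * 3 * 4 * 5 == 120.
Dim total = vectorProduct(dims, dims.size());
```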