[RISCV64][SHL] Added FC FP32 executor (openvinotoolkit#23964)
### Details:
 - *Reused the FC RVV implementation from SHL*
- *The PR to the SHL dev branch with an accuracy fix for FC f32:
openvinotoolkit/shl#3*

### Tickets:
 - *N/A*

### TODO:
- [x] Fix `execType: gemm_f32`
- [x] Add wrappers for `csinn_tensor` and `csinn_session` that allocate and
deallocate these structures


### Prerequisites:
- [x] openvinotoolkit#23901
a-sidorova authored and spran180 committed Jul 27, 2024
1 parent cd7cc25 commit a20bbf7
Showing 16 changed files with 585 additions and 4 deletions.
9 changes: 9 additions & 0 deletions src/plugins/intel_cpu/CMakeLists.txt
@@ -123,6 +123,11 @@ if(OV_CPU_WITH_ACL)
set(CMAKE_CXX_STANDARD 14)
endif()

if(ENABLE_SHL_FOR_CPU)
add_definitions(-DOV_CPU_WITH_SHL)
set(OV_CPU_WITH_SHL ON)
endif()

file(GLOB_RECURSE SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp)
file(GLOB_RECURSE HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/src/*.h
${CMAKE_CURRENT_SOURCE_DIR}/src/*.hpp)
@@ -132,6 +137,10 @@ if(NOT OV_CPU_WITH_ACL)
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/acl/*)
endif()

if(NOT OV_CPU_WITH_SHL)
list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/shl/*)
endif()

if(NOT X86_64)
list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/x64/*
4 changes: 4 additions & 0 deletions src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -949,6 +949,10 @@ void GraphOptimizer::FuseFCAndConvertOnWeights(Graph& graph) {
}

void GraphOptimizer::FuseFCAndTransposeOnWeights(Graph& graph) {
#if defined(OV_CPU_WITH_SHL)
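    // Transposing weights inside the FC node is not supported by the SHL executors,
    // so keep the explicit Transpose node in SHL builds.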
return;
#endif

// This optimization allows us to avoid transposing the weights in Transpose node and do it directly along with reordering in FC node
auto& graphNodes = graph.GetNodes();

1 change: 1 addition & 0 deletions src/plugins/intel_cpu/src/node.cpp
@@ -456,6 +456,7 @@ std::string Node::getPrimitiveDescriptorType() const {
SEARCH_TYPE(winograd);
SEARCH_TYPE(sparse);
SEARCH_TYPE(acl);
SEARCH_TYPE(shl);
SEARCH_TYPE(_dw);
SEARCH_TYPE(_1x1);

2 changes: 2 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/executor.cpp
@@ -20,6 +20,7 @@ std::string ExecutorTypeToString(const ExecutorType type) {
CASE(Acl);
CASE(Mlas);
CASE(jit_aarch64);
CASE(Shl);
}
#undef CASE
return "Undefined";
@@ -35,6 +36,7 @@ ExecutorType ExecutorTypeFromString(const std::string& typeStr) {
CASE(Acl);
CASE(Mlas);
CASE(jit_aarch64);
CASE(Shl);
#undef CASE
return ExecutorType::Undefined;
}
9 changes: 8 additions & 1 deletion src/plugins/intel_cpu/src/nodes/executors/executor.hpp
@@ -47,6 +47,12 @@ namespace intel_cpu {
# define OV_CPU_INSTANCE_MLAS_X64(...)
#endif

#if defined(OV_CPU_WITH_SHL)
# define OV_CPU_INSTANCE_SHL(...) {__VA_ARGS__},
#else
# define OV_CPU_INSTANCE_SHL(...)
#endif
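// When OV_CPU_WITH_SHL is defined, OV_CPU_INSTANCE_SHL(args...) expands to the
// braced initializer {args...}, contributing one entry to an implementations
// table; otherwise it expands to nothing and the entry is compiled out.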

#define OV_CPU_INSTANCE_COMMON(...) {__VA_ARGS__},

// @todo another option is to determine shape relation by executor type
@@ -63,7 +69,8 @@ enum class ExecutorType {
Dnnl,
Acl,
Mlas,
jit_aarch64
jit_aarch64,
Shl
};

enum class OperationType {
src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp
@@ -26,6 +26,10 @@
#include "ov_optional.hpp"
#include "utils/cpp/maybe_unused.hpp"

#if defined(OV_CPU_WITH_SHL)
# include "nodes/executors/shl/shl_fullyconnected.hpp"
#endif

namespace ov {
namespace intel_cpu {

@@ -301,6 +305,36 @@ const std::vector<ExecutorImplementation<FCAttrs>>& getImplementations() {
context,
false);
})
OV_CPU_INSTANCE_SHL(
"fullyconnected_shl",
ExecutorType::Shl,
OperationType::FullyConnected,
ShapeTolerance::Agnostic,
// supports
[](const FCConfig& config) -> bool {
VERIFY(noPostOps(config), UNSUPPORTED_POST_OPS);
VERIFY(noSparseDecompression(config), UNSUPPORTED_SPARSE_WEIGHTS);
VERIFY(noWeightsDecompression(config), UNSUPPORTED_WEIGHTS_DECOMPRESSION);
VERIFY(everyone_is(f32, srcType(config), weiType(config), dstType(config)), UNSUPPORTED_SRC_PRECISIONS);

return ShlFCExecutor::supports(config);
},
// requiresFallback
[](const FCConfig& config) -> ov::optional<executor::Config<FCAttrs>> {
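        // returning an empty optional means no fallback configuration is required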
return {};
},
// acceptsShapes
[](const MemoryArgs& memory) -> bool {
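        // shape-agnostic executor (see ShapeTolerance::Agnostic above): accept any shapes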
return true;
},
// create
[](const FCAttrs& attrs,
const PostOps& postOps,
const MemoryArgs& memory,
const ExecutorContext::CPtr context) {
return std::make_shared<ShlFCExecutor>(attrs, postOps, memory, context);
}
)
OV_CPU_INSTANCE_DNNL(
"fullyconnected_dnnl",
ExecutorType::Dnnl,
191 changes: 191 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/shl/shl.hpp
@@ -0,0 +1,191 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once

#include "shl_utils.hpp"
#include "csinn/csinn_data_structure.h"
#include "csinn/csinn_runtime.h"

#include "memory_desc/cpu_memory_desc.h"

#include <memory>


namespace ov {
namespace intel_cpu {


template <typename T>
struct ShlStructureTraits {};
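
// ShlStructure below is a shared_ptr-based RAII wrapper over raw SHL handles:
// ShlStructureTraits<T>::destructor is installed as the deleter, so the wrapped
// csinn_* object is freed once the last copy goes out of scope.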

template <typename T, typename traits = ShlStructureTraits<T>>
struct ShlStructure {
public:
ShlStructure() = default;
ShlStructure(const ShlStructure<T, traits>&) = default;
ShlStructure(ShlStructure<T, traits>&&) = default;
explicit ShlStructure(T t) { reset(t); }

ShlStructure<T, traits> &operator=(const ShlStructure<T, traits>&) = default;
ShlStructure<T, traits> &operator=(ShlStructure<T, traits>&&) = default;

void reset(T t) {
m_ptr.reset(t, traits::destructor);
}

T get(bool allow_empty = false) const {
T result = m_ptr.get();
OPENVINO_ASSERT(allow_empty || result != nullptr, "ShlStructure is not initialized");
return result;
}

explicit operator T() const {
return get(true);
}

explicit operator bool() const {
return get(true) != nullptr;
}

bool operator==(const ShlStructure<T, traits> &other) const {
return other.m_ptr.get() == m_ptr.get();
}
bool operator!=(const ShlStructure &other) const {
return !(*this == other);
}

private:
std::shared_ptr<typename std::remove_pointer<T>::type> m_ptr = nullptr;

protected:
bool operator==(const T other) const { return other == m_ptr.get(); }
bool operator!=(const T other) const { return !(*this == other); }
};

template <>
struct ShlStructureTraits<csinn_session*> {
static void destructor(csinn_session* p) {
return csinn_free_session(p);
}
};
struct ShlSession : public ShlStructure<csinn_session*> {
ShlSession() {
csinn_session* session = csinn_alloc_session();
OPENVINO_ASSERT(session != nullptr, "Failed to create csinn_session");
// CPU Plugin supports only per layer execution in SHL
session->base_run_mode = CSINN_RM_LAYER;
reset(session);
}
};

template <>
struct ShlStructureTraits<csinn_tensor*> {
static void destructor(csinn_tensor* p) {
return csinn_free_tensor(p);
}
};
struct ShlTensor : public ShlStructure<csinn_tensor*> {
ShlTensor() {
csinn_tensor* tensor = csinn_alloc_tensor(nullptr);
OPENVINO_ASSERT(tensor != nullptr, "Failed to create csinn_tensor");
reset(tensor);
}

ShlTensor(const ShlSession& session) {
csinn_tensor* tensor = csinn_alloc_tensor(session.get());
OPENVINO_ASSERT(tensor != nullptr, "Failed to create csinn_tensor");
reset(tensor);
}

ShlTensor(const ShlSession& session, csinn_dtype_enum data_type, csinn_layout_enum layout, const VectorDims& shape = {}, void* data = nullptr)
: ShlTensor(session) {
setPrecision(data_type);
setLayout(layout);
setShape(shape);
setData(data);
}

ShlTensor(const ShlTensor& another) : ShlTensor() {
csinn_tensor_copy(get(), another.get());
}

csinn_layout_enum getLayout() const {
// csinn_tensor contains `layout` as int32_t
return static_cast<csinn_layout_enum>(get()->layout);
}

csinn_dtype_enum getPrecision() const {
return get()->dtype;
}

VectorDims getShape() const {
VectorDims shape(get()->dim_count);
for (size_t i = 0; i < shape.size(); ++i)
shape[i] = static_cast<size_t>(get()->dim[i]);
return shape;
}

void* getData() const {
return get()->data;
}

void setData(void* data) {
get()->data = data;
}

ShlTensor cloneWithNewShape(const VectorDims& shape) const {
ShlTensor cloned(*this);
cloned.setShape(shape);
return cloned;
}

#ifdef CPU_DEBUG_CAPS
void print() const {
std::cout << "Shape: " << ov::Shape(getShape()) << " "
<< "DataType: " << getPrecision() << " "
<< "Layout: " << getLayout() << " "
<< "Ptr: " << getData() << std::endl;
}
#endif

private:
void setLayout(csinn_layout_enum layout) {
get()->layout = layout;
}

void setPrecision(csinn_dtype_enum data_type) {
get()->dtype = data_type;
}

    void setShape(const VectorDims& shape) {
        OPENVINO_ASSERT(shape.size() <= MAX_DIM, "SHL supports shapes with rank less than or equal to ", MAX_DIM);
        get()->dim_count = static_cast<int32_t>(shape.size());
        for (int i = 0; i < get()->dim_count; ++i)
            get()->dim[i] = static_cast<int32_t>(shape[i]);
    }
};

template <>
struct ShlStructureTraits<csinn_fc_params*> {
static void destructor(csinn_fc_params* p) {
return csinn_free_params(p);
}
};
struct ShlFCParams : public ShlStructure<csinn_fc_params*> {
ShlFCParams() {
csinn_fc_params* params = static_cast<csinn_fc_params*>(csinn_alloc_params(sizeof(csinn_fc_params), nullptr));
OPENVINO_ASSERT(params != nullptr, "Failed to create csinn_fc_params");
reset(params);
}

ShlFCParams(const ShlSession& session, csinn_api_enum api) {
csinn_fc_params* params = static_cast<csinn_fc_params*>(csinn_alloc_params(sizeof(csinn_fc_params), session.get()));
OPENVINO_ASSERT(params != nullptr, "Failed to create csinn_fc_params");
params->base.api = api;
reset(params);
}
};

} // namespace intel_cpu
} // namespace ov
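
For reference, a minimal usage sketch of the wrappers above (illustrative only, not part of the commit). It assumes SHL's `csinn_fullyconnected_init`/`csinn_fullyconnected` entry points, the `CSINN_RVV` API value, the `CSINN_LAYOUT_NC`/`CSINN_LAYOUT_N` layouts and the `CSINN_TRUE` status code from the csinn headers; the helper name `shl_fc_f32` is hypothetical:

#include "shl.hpp"

using namespace ov::intel_cpu;

// dst[M x N] = src[M x K] * wei[N x K]^T + bia[N], all f32.
void shl_fc_f32(float* src, float* wei, float* bia, float* dst, size_t M, size_t N, size_t K) {
    ShlSession session;  // csinn_free_session runs when the last copy is destroyed
    ShlTensor input(session, CSINN_DTYPE_FLOAT32, CSINN_LAYOUT_NC, {M, K}, src);
    ShlTensor weights(session, CSINN_DTYPE_FLOAT32, CSINN_LAYOUT_NC, {N, K}, wei);
    ShlTensor bias(session, CSINN_DTYPE_FLOAT32, CSINN_LAYOUT_N, {N}, bia);
    ShlTensor output(session, CSINN_DTYPE_FLOAT32, CSINN_LAYOUT_NC, {M, N}, dst);
    ShlFCParams params(session, CSINN_RVV);

    // init() selects a kernel for the given shapes/precisions; the second call executes it.
    OPENVINO_ASSERT(csinn_fullyconnected_init(input.get(), output.get(), weights.get(),
                                              bias.get(), params.get()) == CSINN_TRUE,
                    "SHL: unsupported FC configuration");
    csinn_fullyconnected(input.get(), output.get(), weights.get(), bias.get(), params.get());
}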