[RISCV64][SHL] Added FC FP32 executor (openvinotoolkit#23964)
### Details:
 - *Reused the FC RVV implementation from SHL*
- *The PR to the SHL dev branch with an accuracy fix for FC f32:
openvinotoolkit/shl#3*

### Tickets:
 - *N/A*

### TODO:
- [x] Fix `execType: gemm_f32`
- [x] Add wrappers for `csinn_tensor` and `csinn_session` that allocate and
deallocate these structures


### Prerequisites:
- [x] openvinotoolkit#23901
a-sidorova authored and spran180 committed Jul 27, 2024
1 parent cd7cc25 commit a20bbf7
Showing 16 changed files with 585 additions and 4 deletions.
9 changes: 9 additions & 0 deletions src/plugins/intel_cpu/CMakeLists.txt
@@ -123,6 +123,11 @@ if(OV_CPU_WITH_ACL)
set(CMAKE_CXX_STANDARD 14)
endif()

if(ENABLE_SHL_FOR_CPU)
add_definitions(-DOV_CPU_WITH_SHL)
set(OV_CPU_WITH_SHL ON)
endif()

file(GLOB_RECURSE SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp)
file(GLOB_RECURSE HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/src/*.h
${CMAKE_CURRENT_SOURCE_DIR}/src/*.hpp)
@@ -132,6 +137,10 @@ if(NOT OV_CPU_WITH_ACL)
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/acl/*)
endif()

if(NOT OV_CPU_WITH_SHL)
list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/shl/*)
endif()

if(NOT X86_64)
list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/x64/*
4 changes: 4 additions & 0 deletions src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -949,6 +949,10 @@ void GraphOptimizer::FuseFCAndConvertOnWeights(Graph& graph) {
}

void GraphOptimizer::FuseFCAndTransposeOnWeights(Graph& graph) {
#if defined(OV_CPU_WITH_SHL)
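    // Transposing weights inside the FC node is not supported by the SHL executors,
    // so keep the explicit Transpose node in SHL builds.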
return;
#endif

// This optimization allows us to avoid transposing the weights in Transpose node and do it directly along with reordering in FC node
auto& graphNodes = graph.GetNodes();

1 change: 1 addition & 0 deletions src/plugins/intel_cpu/src/node.cpp
@@ -456,6 +456,7 @@ std::string Node::getPrimitiveDescriptorType() const {
SEARCH_TYPE(winograd);
SEARCH_TYPE(sparse);
SEARCH_TYPE(acl);
SEARCH_TYPE(shl);
SEARCH_TYPE(_dw);
SEARCH_TYPE(_1x1);

2 changes: 2 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/executor.cpp
@@ -20,6 +20,7 @@ std::string ExecutorTypeToString(const ExecutorType type) {
CASE(Acl);
CASE(Mlas);
CASE(jit_aarch64);
CASE(Shl);
}
#undef CASE
return "Undefined";
@@ -35,6 +36,7 @@ ExecutorType ExecutorTypeFromString(const std::string& typeStr) {
CASE(Acl);
CASE(Mlas);
CASE(jit_aarch64);
CASE(Shl);
#undef CASE
return ExecutorType::Undefined;
}
9 changes: 8 additions & 1 deletion src/plugins/intel_cpu/src/nodes/executors/executor.hpp
@@ -47,6 +47,12 @@ namespace intel_cpu {
# define OV_CPU_INSTANCE_MLAS_X64(...)
#endif

#if defined(OV_CPU_WITH_SHL)
# define OV_CPU_INSTANCE_SHL(...) {__VA_ARGS__},
#else
# define OV_CPU_INSTANCE_SHL(...)
#endif
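// When OV_CPU_WITH_SHL is defined, OV_CPU_INSTANCE_SHL(args...) expands to the
// braced initializer {args...}, contributing one entry to an implementations
// table; otherwise it expands to nothing and the entry is compiled out.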

#define OV_CPU_INSTANCE_COMMON(...) {__VA_ARGS__},

// @todo another option is to determine shape relation by executor type
@@ -63,7 +69,8 @@ enum class ExecutorType {
Dnnl,
Acl,
Mlas,
jit_aarch64
jit_aarch64,
Shl
};

enum class OperationType {
src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp
@@ -26,6 +26,10 @@
#include "ov_optional.hpp"
#include "utils/cpp/maybe_unused.hpp"

#if defined(OV_CPU_WITH_SHL)
# include "nodes/executors/shl/shl_fullyconnected.hpp"
#endif

namespace ov {
namespace intel_cpu {

@@ -301,6 +305,36 @@ const std::vector<ExecutorImplementation<FCAttrs>>& getImplementations() {
context,
false);
})
OV_CPU_INSTANCE_SHL(
"fullyconnected_shl",
ExecutorType::Shl,
OperationType::FullyConnected,
ShapeTolerance::Agnostic,
// supports
[](const FCConfig& config) -> bool {
VERIFY(noPostOps(config), UNSUPPORTED_POST_OPS);
VERIFY(noSparseDecompression(config), UNSUPPORTED_SPARSE_WEIGHTS);
VERIFY(noWeightsDecompression(config), UNSUPPORTED_WEIGHTS_DECOMPRESSION);
VERIFY(everyone_is(f32, srcType(config), weiType(config), dstType(config)), UNSUPPORTED_SRC_PRECISIONS);

return ShlFCExecutor::supports(config);
},
// requiresFallback
[](const FCConfig& config) -> ov::optional<executor::Config<FCAttrs>> {
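        // returning an empty optional means no fallback configuration is required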
return {};
},
// acceptsShapes
[](const MemoryArgs& memory) -> bool {
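        // shape-agnostic executor (see ShapeTolerance::Agnostic above): accept any shapes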
return true;
},
// create
[](const FCAttrs& attrs,
const PostOps& postOps,
const MemoryArgs& memory,
const ExecutorContext::CPtr context) {
return std::make_shared<ShlFCExecutor>(attrs, postOps, memory, context);
}
)
OV_CPU_INSTANCE_DNNL(
"fullyconnected_dnnl",
ExecutorType::Dnnl,
191 changes: 191 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/shl/shl.hpp
@@ -0,0 +1,191 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once

#include "shl_utils.hpp"
#include "csinn/csinn_data_structure.h"
#include "csinn/csinn_runtime.h"

#include "memory_desc/cpu_memory_desc.h"

#include <memory>


namespace ov {
namespace intel_cpu {


template <typename T>
struct ShlStructureTraits {};
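
// ShlStructure below is a shared_ptr-based RAII wrapper over raw SHL handles:
// ShlStructureTraits<T>::destructor is installed as the deleter, so the wrapped
// csinn_* object is freed once the last copy goes out of scope.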

template <typename T, typename traits = ShlStructureTraits<T>>
struct ShlStructure {
public:
ShlStructure() = default;
ShlStructure(const ShlStructure<T, traits>&) = default;
ShlStructure(ShlStructure<T, traits>&&) = default;
explicit ShlStructure(T t) { reset(t); }

ShlStructure<T, traits> &operator=(const ShlStructure<T, traits>&) = default;
ShlStructure<T, traits> &operator=(ShlStructure<T, traits>&&) = default;

void reset(T t) {
m_ptr.reset(t, traits::destructor);
}

T get(bool allow_empty = false) const {
T result = m_ptr.get();
OPENVINO_ASSERT(allow_empty || result != nullptr, "ShlStructure is not initialized");
return result;
}

explicit operator T() const {
return get(true);
}

explicit operator bool() const {
return get(true) != nullptr;
}

bool operator==(const ShlStructure<T, traits> &other) const {
return other.m_ptr.get() == m_ptr.get();
}
bool operator!=(const ShlStructure &other) const {
return !(*this == other);
}

private:
std::shared_ptr<typename std::remove_pointer<T>::type> m_ptr = nullptr;

protected:
bool operator==(const T other) const { return other == m_ptr.get(); }
bool operator!=(const T other) const { return !(*this == other); }
};

template <>
struct ShlStructureTraits<csinn_session*> {
static void destructor(csinn_session* p) {
return csinn_free_session(p);
}
};
struct ShlSession : public ShlStructure<csinn_session*> {
ShlSession() {
csinn_session* session = csinn_alloc_session();
OPENVINO_ASSERT(session != nullptr, "Failed to create csinn_session");
// CPU Plugin supports only per layer execution in SHL
session->base_run_mode = CSINN_RM_LAYER;
reset(session);
}
};

template <>
struct ShlStructureTraits<csinn_tensor*> {
static void destructor(csinn_tensor* p) {
return csinn_free_tensor(p);
}
};
struct ShlTensor : public ShlStructure<csinn_tensor*> {
ShlTensor() {
csinn_tensor* tensor = csinn_alloc_tensor(nullptr);
OPENVINO_ASSERT(tensor != nullptr, "Failed to create csinn_tensor");
reset(tensor);
}

ShlTensor(const ShlSession& session) {
csinn_tensor* tensor = csinn_alloc_tensor(session.get());
OPENVINO_ASSERT(tensor != nullptr, "Failed to create csinn_tensor");
reset(tensor);
}

ShlTensor(const ShlSession& session, csinn_dtype_enum data_type, csinn_layout_enum layout, const VectorDims& shape = {}, void* data = nullptr)
: ShlTensor(session) {
setPrecision(data_type);
setLayout(layout);
setShape(shape);
setData(data);
}

ShlTensor(const ShlTensor& another) : ShlTensor() {
csinn_tensor_copy(get(), another.get());
}

csinn_layout_enum getLayout() const {
// csinn_tensor contains `layout` as int32_t
return static_cast<csinn_layout_enum>(get()->layout);
}

csinn_dtype_enum getPrecision() const {
return get()->dtype;
}

VectorDims getShape() const {
VectorDims shape(get()->dim_count);
for (size_t i = 0; i < shape.size(); ++i)
shape[i] = static_cast<size_t>(get()->dim[i]);
return shape;
}

void* getData() const {
return get()->data;
}

void setData(void* data) {
get()->data = data;
}

ShlTensor cloneWithNewShape(const VectorDims& shape) const {
ShlTensor cloned(*this);
cloned.setShape(shape);
return cloned;
}

#ifdef CPU_DEBUG_CAPS
void print() const {
std::cout << "Shape: " << ov::Shape(getShape()) << " "
<< "DataType: " << getPrecision() << " "
<< "Layout: " << getLayout() << " "
<< "Ptr: " << getData() << std::endl;
}
#endif

private:
void setLayout(csinn_layout_enum layout) {
get()->layout = layout;
}

void setPrecision(csinn_dtype_enum data_type) {
get()->dtype = data_type;
}

    void setShape(const VectorDims& shape) {
        OPENVINO_ASSERT(shape.size() <= MAX_DIM, "SHL supports shapes with rank less than or equal to ", MAX_DIM);
        get()->dim_count = static_cast<int32_t>(shape.size());
        for (int i = 0; i < get()->dim_count; ++i)
            get()->dim[i] = static_cast<int32_t>(shape[i]);
    }
};

template <>
struct ShlStructureTraits<csinn_fc_params*> {
static void destructor(csinn_fc_params* p) {
return csinn_free_params(p);
}
};
struct ShlFCParams : public ShlStructure<csinn_fc_params*> {
ShlFCParams() {
csinn_fc_params* params = static_cast<csinn_fc_params*>(csinn_alloc_params(sizeof(csinn_fc_params), nullptr));
OPENVINO_ASSERT(params != nullptr, "Failed to create csinn_fc_params");
reset(params);
}

ShlFCParams(const ShlSession& session, csinn_api_enum api) {
csinn_fc_params* params = static_cast<csinn_fc_params*>(csinn_alloc_params(sizeof(csinn_fc_params), session.get()));
OPENVINO_ASSERT(params != nullptr, "Failed to create csinn_fc_params");
params->base.api = api;
reset(params);
}
};

} // namespace intel_cpu
} // namespace ov
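
For reference, a minimal usage sketch of the wrappers above (illustrative only, not part of the commit). It assumes SHL's `csinn_fullyconnected_init`/`csinn_fullyconnected` entry points, the `CSINN_RVV` API value, the `CSINN_LAYOUT_NC`/`CSINN_LAYOUT_N` layouts and the `CSINN_TRUE` status code from the csinn headers; the helper name `shl_fc_f32` is hypothetical:

#include "shl.hpp"

using namespace ov::intel_cpu;

// dst[M x N] = src[M x K] * wei[N x K]^T + bia[N], all f32.
void shl_fc_f32(float* src, float* wei, float* bia, float* dst, size_t M, size_t N, size_t K) {
    ShlSession session;  // csinn_free_session runs when the last copy is destroyed
    ShlTensor input(session, CSINN_DTYPE_FLOAT32, CSINN_LAYOUT_NC, {M, K}, src);
    ShlTensor weights(session, CSINN_DTYPE_FLOAT32, CSINN_LAYOUT_NC, {N, K}, wei);
    ShlTensor bias(session, CSINN_DTYPE_FLOAT32, CSINN_LAYOUT_N, {N}, bia);
    ShlTensor output(session, CSINN_DTYPE_FLOAT32, CSINN_LAYOUT_NC, {M, N}, dst);
    ShlFCParams params(session, CSINN_RVV);

    // init() selects a kernel for the given shapes/precisions; the second call executes it.
    OPENVINO_ASSERT(csinn_fullyconnected_init(input.get(), output.get(), weights.get(),
                                              bias.get(), params.get()) == CSINN_TRUE,
                    "SHL: unsupported FC configuration");
    csinn_fullyconnected(input.get(), output.get(), weights.get(), bias.get(), params.get());
}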