[CPU] [ARM] GEMM: int8 support
eshoguli committed Aug 10, 2024
1 parent 9260157 commit 5e23e30
Showing 10 changed files with 497 additions and 14 deletions.
101 changes: 101 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.cpp
@@ -0,0 +1,101 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_gemm.hpp"

#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"

#include "nodes/executors/executor.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"
#include "nodes/executors/debug_messages.hpp"
#include "nodes/executors/implementation_utils.hpp"

namespace ov {
namespace intel_cpu {

ACLGEMMExecutor::ACLGEMMExecutor(const GEMMAttrs &attrs,
const PostOps &postOps,
const MemoryArgs &memory,
const ExecutorContext::CPtr context) {
aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc);
OPENVINO_ASSERT(!attrs.transpose_a && !attrs.transpose_b, "ACLGEMMExecutor: transpose_a/transpose_b are not supported");
}

bool ACLGEMMExecutor::supports(const GEMMConfig &config) {
// TODO: check weights layout
const auto& attrs = config.attrs;
if (attrs.transpose_a || attrs.transpose_b) {
return false;
}
if (std::any_of(
attrs.dequantizationScales.begin(),
attrs.dequantizationScales.end(),
[](float value) { return value != 1.f;})) {
return false;
}

const auto src1_dims = std::dynamic_pointer_cast<BlockedMemoryDesc>(config.descs.at(ARG_SRC))->getBlockDims();
const auto src2_dims = std::dynamic_pointer_cast<BlockedMemoryDesc>(config.descs.at(ARG_WEI))->getBlockDims();

const auto precision = srcType(config);
VERIFY(one_of(precision, ov::element::i8, ov::element::u8), UNSUPPORTED_SRC_PRECISIONS);
VERIFY(postOpsNumbers(config) == 0, UNSUPPORTED_NUMBER_OF_POSTOPS);
VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK);
VERIFY(one_of(weiRank(config), 2U, 3U, 4U), UNSUPPORTED_WEI_RANK);
VERIFY(attrs.dequantizationScales.size() <= 1, UNSUPPORTED_PER_CHANNEL_QUANTIZATION);
return true;
}

void ACLGEMMExecutor::updateTensorsShapes(ACLShapes& aclMemoryShapes) {
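// Intentionally a no-op: the source, weight and destination shapes are consumed as-is, no transposition is applied here.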
// std::swap(aclMemoryShapes[ACLArgs::ACL_SRC_0][0], aclMemoryShapes[ACLArgs::ACL_SRC_0][1]);
// std::swap(aclMemoryShapes[ACLArgs::ACL_WEI][0], aclMemoryShapes[ACLArgs::ACL_WEI][1]);
// std::swap(aclMemoryShapes[ACLArgs::ACL_DST][0], aclMemoryShapes[ACLArgs::ACL_DST][1]);
}

arm_compute::Status ACLGEMMExecutor::validateTensorsInfo(const ACLInfos & aclMemoryInfos) {
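// The bias term is not wired up yet: both validate() here and configure() below pass nullptr for it.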
const auto matMulValid = arm_compute::NEGEMMLowpMatrixMultiplyCore::validate(
aclMemoryInfos[ACLArgs::ACL_SRC_0].get(),
aclMemoryInfos[ACLArgs::ACL_WEI].get(),
nullptr, //aclMemoryInfos[ACLArgs::ACL_BIAS].get(),
aclMemoryInfos[ACLArgs::ACL_DST].get(),
gemmInfo);
return matMulValid;
}

ACLFunction ACLGEMMExecutor::configureFunction(const ACLTensors & aclMemoryTensors) {
auto gemm = std::make_unique<arm_compute::NEGEMMLowpMatrixMultiplyCore>();
gemm->configure(
aclMemoryTensors[ACLArgs::ACL_SRC_0].get(),
aclMemoryTensors[ACLArgs::ACL_WEI].get(),
nullptr, //aclMemoryTensors[ACLArgs::ACL_BIAS].get(),
aclMemoryTensors.at(ACLArgs::ACL_DST).get(),
gemmInfo);
return gemm;
}

std::shared_ptr<arm_compute::TensorInfo> ACLGEMMExecutor::initTensorInfo(const arm_compute::TensorShape& tensorShape,
const arm_compute::DataType& dataType,
const arm_compute::DataLayout& dataLayout) {
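// NEGEMMLowpMatrixMultiplyCore only accepts quantized input types, so plain S8/U8 are re-declared
// as QASYMM8_SIGNED/QASYMM8; all other data types pass through unchanged.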
arm_compute::DataType result;
switch (dataType) {
case arm_compute::DataType::S8: {
result = arm_compute::DataType::QASYMM8_SIGNED;
break;
}
case arm_compute::DataType::U8: {
result = arm_compute::DataType::QASYMM8;
break;
}
default: {
result = dataType;
break;
}
}

return ACLCommonExecutor::initTensorInfo(tensorShape, result, dataLayout);
}

} // namespace intel_cpu
} // namespace ov
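For context, the S8/U8 → QASYMM8_SIGNED/QASYMM8 remap in initTensorInfo above is what lets NEGEMMLowpMatrixMultiplyCore accept plain int8 tensors, since the lowp kernel only takes quantized data types. Below is a minimal standalone sketch of the check the executor ends up running; the shapes, the identity QuantizationInfo and the S32 destination are illustrative assumptions, not values taken from this commit.

// Sketch only: validate an int8 GEMM the way ACLGEMMExecutor does after the type remap.
#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"

bool int8_gemm_is_supported() {
    using namespace arm_compute;
    // ACL TensorShape lists the innermost dimension first: A is MxK -> (K, M), B is KxN -> (N, K).
    TensorInfo a(TensorShape(64U, 16U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(1.f, 0));
    TensorInfo b(TensorShape(32U, 64U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(1.f, 0));
    TensorInfo dst(TensorShape(32U, 16U), 1, DataType::S32);  // 16x32 result, i32 accumulation
    const Status status = NEGEMMLowpMatrixMultiplyCore::validate(&a, &b, /*c=*/nullptr, &dst, GEMMInfo());
    return static_cast<bool>(status);  // Status converts to true when validation passed
}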
46 changes: 46 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.hpp
@@ -0,0 +1,46 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "acl_common_executor.hpp"
#include "nodes/executors/gemm_config.hpp"

namespace ov {
namespace intel_cpu {

class ACLGEMMExecutor : public ACLCommonExecutor {
public:
ACLGEMMExecutor(const GEMMAttrs& attrs,
const PostOps& postOps,
const MemoryArgs& memory,
const ExecutorContext::CPtr context);

static bool supports(const GEMMConfig& config);

void updateTensorsShapes(ACLShapes& aclMemoryShapes) override;

arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override;

ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override;

impl_desc_type implType() const override {
return impl_desc_type::gemm_acl;
}

protected:
std::shared_ptr<arm_compute::TensorInfo> initTensorInfo(const arm_compute::TensorShape& tensorShape,
const arm_compute::DataType& dataType,
const arm_compute::DataLayout& dataLayout) override;

private:
arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo;
arm_compute::GEMMInfo gemmInfo;
arm_compute::WeightsInfo weightsInfo;
};

using ACLGEMMExecutorPtr = std::shared_ptr<ACLGEMMExecutor>;

} // namespace intel_cpu
} // namespace ov
25 changes: 25 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/gemm_attrs.hpp
@@ -0,0 +1,25 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <vector>

#include "cpu_memory.h"
#include "executor_config.hpp"

namespace ov {
namespace intel_cpu {

struct GEMMAttrs {
bool withBias = false;
bool transpose_a = false;
bool transpose_b = false;
bool sparseWeights = false;
std::vector<float> dequantizationScales;
ov::intel_cpu::Config::ModelType modelType = ov::intel_cpu::Config::ModelType::Unknown;
};

} // namespace intel_cpu
} // namespace ov
14 changes: 14 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/gemm_config.hpp
@@ -0,0 +1,14 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "executor_config.hpp"
#include "gemm_attrs.hpp"

namespace ov {
namespace intel_cpu {
using GEMMConfig = ov::intel_cpu::executor::Config<GEMMAttrs>;
} // namespace intel_cpu
} // namespace ov
179 changes: 179 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/gemm_implementations.cpp
@@ -0,0 +1,179 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <memory>
#include <vector>

#include "cpu/x64/cpu_isa_traits.hpp"
#include "debug_messages.hpp"
#include "implementation_utils.hpp"
#include "memory_desc/cpu_memory_desc.h"
#include "nodes/executors/gemm_config.hpp"
#include "nodes/executors/dnnl/dnnl_convolution_primitive.hpp"
#include "nodes/executors/dnnl/dnnl_fullyconnected.hpp"
#include "nodes/executors/dnnl/dnnl_shape_agnostic_data.hpp"
#include "nodes/executors/executor.hpp"
#include "nodes/executors/executor_implementation.hpp"
#include "nodes/executors/implementations.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "nodes/executors/precision_matcher.hpp"
#include "nodes/executors/precision_translation.hpp"
#include "nodes/executors/type_mask.hpp"
#include "openvino/core/type/element_type.hpp"
#include "ov_optional.hpp"
#include "utils/cpp/maybe_unused.hpp"

#if defined(OV_CPU_WITH_ACL)
#include "nodes/executors/acl/acl_fullyconnected.hpp"
#include "nodes/executors/acl/acl_gemm.hpp"
#endif

namespace ov {
namespace intel_cpu {

using namespace ov::element;
using namespace TypeMaskAlias;
using namespace executor;

static const MappingNotation dnnlFCMappingNotation{ARG_SRC, ARG_WEI, ARG_BIAS, ARG_DST};

using LayoutConfig = std::vector<LayoutType>;
static const LayoutConfig dnnlFCLayoutConfig{LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp};
static const LayoutConfig aclFCLayoutConfig{LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp};
static const LayoutConfig aclMatMulLayoutConfig{LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp};

template<dnnl::impl::cpu::x64::cpu_isa_t ISA>
struct Require {
bool operator()() {
return dnnl::impl::cpu::x64::mayiuse(ISA);
}
};

// clang-format off
static const TypeMapping aclMatMulTypeMapping {
// {src, wei, bia, dst} pt<src, wei, bias, dst>
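// i8/u8 activations and weights are kept as-is with i32 bias/output; any other combination is converted to f32.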
{{_i8, _i8, _any, _any}, pt(just<i8>(), just<i8>(), just<i32>(), just<i32>())},
{{_u8, _u8, _any, _any}, pt(just<u8>(), just<u8>(), just<i32>(), just<i32>())},
{{_any, _any, _any, _any}, pt(just<f32>(), just<f32>(), just<f32>(), just<f32>())}
};

static const MappingNotation aclMatMulMappingNotation {
ARG_SRC, ARG_WEI, ARG_BIAS, ARG_DST
};

// clang-format on

static bool fullyMatchConfiguration(const MemoryDescArgs& currentDescriptors,
const InOutTypes& typeConfig,
const LayoutConfig& layoutConfig,
const MappingNotation& notation) {
for (size_t i = 0; i < typeConfig.size(); i++) {
const auto& type = typeConfig[i];
const auto& desc = currentDescriptors.at(notation[i]);

if (desc->empty())
continue;

if (desc->getPrecision() != type)
return false; // type mismatch

if (!desc->hasLayoutType(layoutConfig[i]))
return false; // layout mismatch
}

return true;
}

static MemoryDescArgs createOptimalDescriptors(const MemoryDescArgs& currentDescriptors,
const InOutTypes& typeConfig,
const LayoutConfig& layoutConfig,
const MappingNotation& notation) {
MemoryDescArgs descs = currentDescriptors;

const auto& creatorsMap = BlockedDescCreator::getCommonCreators();
for (size_t i = 0; i < typeConfig.size(); i++) {
const auto& desc = currentDescriptors.at(notation[i]);
const auto& descType = desc->getPrecision();
const auto& type = typeConfig[i];
const auto& layout = layoutConfig[i];

if (desc->empty())
continue;

if (descType == type && desc->hasLayoutType(layout)) {
continue;
}

descs[notation[i]] = creatorsMap.at(layout)->createSharedDesc(type, desc->getShape());
}

return descs;
}

template <typename Attrs>
ov::optional<executor::Config<Attrs>> requiresFallbackCommon(const executor::Config<Attrs>& config,
const TypeMapping& typeMapping,
const LayoutConfig& layoutConfig,
const MappingNotation& notation) {
const auto typeConfig = getTypeConfiguration(config.descs, typeMapping, notation);

if (fullyMatchConfiguration(config.descs, typeConfig, layoutConfig, notation)) {
return {};
}

const auto optimalDescriptors = createOptimalDescriptors(config.descs, typeConfig, layoutConfig, notation);

return ov::optional<executor::Config<Attrs>>(GEMMConfig{optimalDescriptors, config.attrs, config.postOps});
}

OV_CPU_MAYBE_UNUSED_FUNCTION static inline bool noWeightsDecompression(const GEMMConfig& config) {
return !DnnlFCPrimitive::useWeightsDecompressionImpl(srcType(config), weiType(config), config.attrs.modelType);
}

OV_CPU_MAYBE_UNUSED_FUNCTION static inline bool noSparseDecompression(const GEMMConfig& config) {
return !(config.attrs.sparseWeights);
}

OV_CPU_MAYBE_UNUSED_FUNCTION static inline bool noPostOps(const GEMMConfig& config) {
return config.postOps.empty();
}

template <>
const std::vector<ExecutorImplementation<GEMMAttrs>>& getImplementations() {
static const std::vector<ExecutorImplementation<GEMMAttrs>> gemmImplementations {
OV_CPU_INSTANCE_ACL(
"matmul_acl",
ExecutorType::Acl,
OperationType::MatMul,
ShapeTolerance::Agnostic,
// supports
[](const GEMMConfig& config) -> bool {
VERIFY(noSparseDecompression(config), UNSUPPORTED_SPARSE_WEIGHTS);
VERIFY(noWeightsDecompression(config), UNSUPPORTED_WEIGHTS_DECOMPRESSION);
return ACLGEMMExecutor::supports(config);
},
// requiresFallback
[](const GEMMConfig& config) -> ov::optional<executor::Config<GEMMAttrs>> {
return requiresFallbackCommon(config,
aclMatMulTypeMapping,
aclMatMulLayoutConfig,
aclMatMulMappingNotation);
},
// acceptsShapes
[](const MemoryArgs& memory) -> bool {
// @todo create syntactic sugar (functor) for shape agnostic lambda
return true;
},
// create
[](const GEMMAttrs& attrs,
const PostOps& postOps,
const MemoryArgs& memory,
const ExecutorContext::CPtr context) {
return std::make_shared<ACLGEMMExecutor>(attrs, postOps, memory, context);
})
};

return gemmImplementations;
}
} // namespace intel_cpu
} // namespace ov
@@ -8,6 +8,7 @@

#include "nodes/executors/executor_implementation.hpp"
#include "nodes/executors/fullyconnected_config.hpp"
#include "nodes/executors/gemm_config.hpp"

namespace ov {
namespace intel_cpu {
@@ -26,7 +27,9 @@ const std::vector<ExecutorImplementation<Attrs>>& getImplementations() {
template <>
const std::vector<ExecutorImplementation<FCAttrs>>& getImplementations();

// ...
// MatMul
template <>
const std::vector<ExecutorImplementation<GEMMAttrs>>& getImplementations();

} // namespace intel_cpu
} // namespace ov