From 4d3f499000ad522c25aec5b85b59df9e2cba546b Mon Sep 17 00:00:00 2001 From: Edward Shogulin Date: Fri, 12 Jul 2024 20:47:58 +0100 Subject: [PATCH] [CPU] [ARM] int8: GEMM support --- src/plugins/intel_cpu/src/config.h | 3 +- src/plugins/intel_cpu/src/cpu_memory.cpp | 3 +- .../executors/acl/acl_common_executor.cpp | 15 ++ .../executors/acl/acl_fullyconnected.cpp | 44 +---- .../executors/acl/acl_fullyconnected.hpp | 7 - .../src/nodes/executors/acl/acl_gemm.cpp | 92 +++++++++ .../src/nodes/executors/acl/acl_gemm.hpp | 46 +++++ .../nodes/executors/fullyconnected_config.hpp | 3 +- .../src/nodes/executors/gemm_attrs.hpp | 34 ++++ .../src/nodes/executors/gemm_config.hpp | 14 ++ .../nodes/executors/gemm_implementations.cpp | 179 ++++++++++++++++++ .../src/nodes/executors/implementations.hpp | 4 + .../src/nodes/executors/printers.cpp | 6 + .../src/nodes/executors/printers.hpp | 2 + src/plugins/intel_cpu/src/nodes/matmul.cpp | 90 +++++++++ src/plugins/intel_cpu/src/nodes/matmul.h | 31 ++- src/plugins/intel_cpu/src/plugin.cpp | 10 + .../arm/pass/mat_mul_decomposition.cpp | 51 +++++ .../arm/pass/mat_mul_decomposition.hpp | 19 ++ .../aarch64/pass/snippets_mark_skipped.cpp | 45 +---- .../transformation_pipeline.cpp | 22 ++- .../fully_connected_transformation.cpp | 63 ------ .../aarch64/mat_mul_transformation.cpp | 119 ++++++++++++ .../mat_mul_transformation.hpp | 2 + .../mat_mul_transformation.cpp | 21 +- .../include/ov_lpt_models/mat_mul.hpp | 3 +- .../ov_helpers/ov_lpt_models/src/mat_mul.cpp | 17 +- 27 files changed, 772 insertions(+), 173 deletions(-) create mode 100644 src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.hpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/gemm_attrs.hpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/gemm_config.hpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/gemm_implementations.cpp create mode 100644 
src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mat_mul_decomposition.cpp create mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mat_mul_decomposition.hpp delete mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/fully_connected_transformation.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/mat_mul_transformation.cpp diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index 4ca6332c25c3cc..f6cd70d4cf9441 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -44,7 +44,8 @@ struct Config { Unknown }; - bool collectPerfCounters = false; + // TODO: workaround to collect performance counters + bool collectPerfCounters = true; bool exclusiveAsyncRequests = false; SnippetsMode snippetsMode = SnippetsMode::Enable; std::string dumpToDot = {}; diff --git a/src/plugins/intel_cpu/src/cpu_memory.cpp b/src/plugins/intel_cpu/src/cpu_memory.cpp index 515fe92845702c..de113cdcffeca0 100644 --- a/src/plugins/intel_cpu/src/cpu_memory.cpp +++ b/src/plugins/intel_cpu/src/cpu_memory.cpp @@ -475,6 +475,7 @@ void DnnlMemoryMngr::notifyUpdate() { StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const void* data, bool pads_zeroing) : m_eng(eng), m_pMemDesc(desc) { + OPENVINO_ASSERT(!desc->empty() || (desc->empty() && (data == nullptr))); if (desc->getPrecision() == element::string) { OPENVINO_THROW("[CPU] StaticMemory object cannot be created for string data."); } @@ -484,7 +485,7 @@ StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const vo m_size = m_pMemDesc->getCurrentMemSize(); - if (data) { + if (data || desc->empty()) { m_pMemMngr = std::make_shared(const_cast(data), m_size); } else { m_pMemMngr = std::make_shared(m_size); diff --git 
a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp index d4a75681d529c8..3c19534e88de3b 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp @@ -2,6 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "src/core/utils/quantization/AsymmHelpers.h" + #include "acl_common_executor.hpp" #include "acl_utils.hpp" #include "nodes/executors/memory_arguments.hpp" @@ -66,7 +68,11 @@ bool ACLCommonExecutor::update(const MemoryArgs &memory) { ACLMemoryTypes aclDataType{}; ACLMemoryLayouts aclDataLayout{}; for (auto& cpu_mem_ptr : memory) { + // TODO: don't init empty tensor + if (cpu_mem_ptr.second->getSize() == 0) { + continue; + } const ACLArgs index = argConvert.at(cpu_mem_ptr.first); initACLTensorParams(cpu_mem_ptr.second, aclTensorAttrs, aclMemoryShapes[index], aclDataType[index], @@ -108,6 +114,7 @@ void ACLCommonExecutor::execute(const MemoryArgs &memory) { aclMemoryTensors[index]->allocator()->import_memory(memory.at(cpu_mem_ptr.first)->getData()); } } + iFunction->run(); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp index 4a034d1c9013fd..5e5089cd6f755c 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp @@ -19,9 +19,6 @@
ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, const aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc); fullyConnectedLayerInfo.weights_trained_layout = getAclDataLayoutByMemoryDesc(memory.at(ARG_WEI)->getDescPtr()); fullyConnectedLayerInfo.transpose_weights = !attrs.weightsNonTransposed; - if (!attrs.dequantizationScales.empty()) { - dequantizationScale = attrs.dequantizationScales[0]; - } // Add postops if (!postOps.empty() && postOps.size() == 1) { @@ -35,20 +32,10 @@ ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, const } bool ACLFullyConnectedExecutor::supports(const FCConfig &config) { - // issue # - const auto attrs = static_cast(config.attrs); - if (std::any_of( - attrs.dequantizationScales.begin(), - attrs.dequantizationScales.end(), - [](float value) { return value != 1.f;})) { - return false; - } - - VERIFY(one_of(srcType(config), ov::element::f16, ov::element::f32, ov::element::i8), UNSUPPORTED_SRC_PRECISIONS); + VERIFY(one_of(srcType(config), ov::element::f16, ov::element::f32), UNSUPPORTED_SRC_PRECISIONS); VERIFY(postOpsNumbers(config) < 2, UNSUPPORTED_NUMBER_OF_POSTOPS); VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK); VERIFY(one_of(weiRank(config), 2U, 3U), UNSUPPORTED_WEI_RANK); - VERIFY(static_cast(config.attrs).dequantizationScales.size() <= 1, UNSUPPORTED_PER_CHANNEL_QUANTIZATION); return true; } @@ -87,43 +74,16 @@ arm_compute::Status ACLFullyConnectedExecutor::validateTensorsInfo(const ACLMemo } ACLFunction ACLFullyConnectedExecutor::configureFunction(const ACLMemoryTensors & aclMemoryTensors) { - const auto dstTensor = aclMemoryTensors.at(ACLArgs::ACL_DST).get(); - if (dequantizationScale != 1.0) { - dstTensor->info()->set_quantization_info(arm_compute::QuantizationInfo(dequantizationScale, 0)); - } - auto neFC = std::make_unique(); neFC->configure( aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), 
aclMemoryTensors[ACLArgs::ACL_WEI].get(), aclMemoryTensors[ACLArgs::ACL_BIAS].get(), - dstTensor, + aclMemoryTensors[ACLArgs::ACL_DST].get(), fullyConnectedLayerInfo, weightsInfo); return neFC; } -ACLInfo ACLFullyConnectedExecutor::initTensorInfo(const arm_compute::TensorShape& tensorShape, - const arm_compute::DataType& dataType, - const arm_compute::DataLayout& dataLayout) { - arm_compute::DataType fcDataType; - switch (dataType) { - case arm_compute::DataType::S8: { - fcDataType = arm_compute::DataType::QASYMM8_SIGNED; - break; - } - case arm_compute::DataType::U8: { - fcDataType = arm_compute::DataType::QASYMM8; - break; - } - default: { - fcDataType = dataType; - break; - } - } - - return ACLCommonExecutor::initTensorInfo(tensorShape, fcDataType, dataLayout); -} - } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp index f44c221403cc93..ffd45c77a5a77b 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp @@ -28,16 +28,9 @@ class ACLFullyConnectedExecutor : public ACLCommonExecutor { impl_desc_type implType() const override { return impl_desc_type::gemm_acl; } - -protected: - ACLInfo initTensorInfo(const arm_compute::TensorShape& tensorShape, - const arm_compute::DataType& dataType, - const arm_compute::DataLayout& dataLayout) override; - private: arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo; arm_compute::WeightsInfo weightsInfo; - float dequantizationScale = 1.f; }; using ACLFullyConnectedExecutorPtr = std::shared_ptr; diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.cpp new file mode 100644 index 00000000000000..076bd2720c689c --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.cpp @@ -0,0 +1,92 @@ 
+// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "acl_gemm.hpp" + +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" + +#include "nodes/executors/executor.hpp" +#include "nodes/executors/memory_arguments.hpp" +#include "utils/debug_capabilities.h" +#include "nodes/executors/debug_messages.hpp" +#include "nodes/executors/implementation_utils.hpp" + +namespace ov { +namespace intel_cpu { + +ACLGEMMExecutor::ACLGEMMExecutor(const GEMMAttrs &attrs, + const PostOps &postOps, + const MemoryArgs &memory, + const ExecutorContext::CPtr context) { + aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc); +} + +bool ACLGEMMExecutor::supports(const GEMMConfig &config) { + // TODO: check weights layout + const auto attrs = static_cast(config.attrs); + if (std::any_of( + attrs.dequantizationScales.begin(), + attrs.dequantizationScales.end(), + [](float value) { return value != 1.f;})) { + return false; + } + + const auto src1_dims = std::dynamic_pointer_cast(config.descs.at(ARG_SRC))->getBlockDims(); + const auto src2_dims = std::dynamic_pointer_cast(config.descs.at(ARG_WEI))->getBlockDims(); + + VERIFY(one_of(srcType(config), ov::element::f16, ov::element::f32, ov::element::i8, ov::element::u8), UNSUPPORTED_SRC_PRECISIONS); + VERIFY(postOpsNumbers(config) < 2, UNSUPPORTED_NUMBER_OF_POSTOPS); + VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK); + VERIFY(one_of(weiRank(config), 2U, 3U, 4U), UNSUPPORTED_WEI_RANK); + VERIFY(static_cast(config.attrs).dequantizationScales.size() <= 1, UNSUPPORTED_PER_CHANNEL_QUANTIZATION); + return true; +} + +void ACLGEMMExecutor::updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) {} + +arm_compute::Status ACLGEMMExecutor::validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) { + const auto matMulValid = arm_compute::NEGEMMLowpMatrixMultiplyCore::validate( + aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + 
aclMemoryInfos[ACLArgs::ACL_WEI].get(), + aclMemoryInfos[ACLArgs::ACL_BIAS].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get(), + gemmInfo); + return matMulValid; +} + +ACLFunction ACLGEMMExecutor::configureFunction(const ACLMemoryTensors & aclMemoryTensors) { + auto matMull = std::make_unique(); + matMull->configure( + aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), + aclMemoryTensors[ACLArgs::ACL_WEI].get(), + // TODO: fix me + nullptr, //aclMemoryTensors[ACLArgs::ACL_BIAS].get(), + aclMemoryTensors.at(ACLArgs::ACL_DST).get()); + return matMull; +} + +ACLInfo ACLGEMMExecutor::initTensorInfo(const arm_compute::TensorShape& tensorShape, + const arm_compute::DataType& dataType, + const arm_compute::DataLayout& dataLayout) { + arm_compute::DataType fcDataType; + switch (dataType) { + case arm_compute::DataType::S8: { + fcDataType = arm_compute::DataType::QASYMM8_SIGNED; + break; + } + case arm_compute::DataType::U8: { + fcDataType = arm_compute::DataType::QASYMM8; + break; + } + default: { + fcDataType = dataType; + break; + } + } + + return ACLCommonExecutor::initTensorInfo(tensorShape, fcDataType, dataLayout); +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.hpp new file mode 100644 index 00000000000000..a1a537da7b6a1d --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "acl_common_executor.hpp" +#include "nodes/executors/gemm_config.hpp" + +namespace ov { +namespace intel_cpu { + +class ACLGEMMExecutor : public ACLCommonExecutor { +public: + ACLGEMMExecutor(const GEMMAttrs& attrs, + const PostOps& postOps, + const MemoryArgs& memory, + const ExecutorContext::CPtr context); + + static bool supports(const GEMMConfig& config); + + void updateTensorsShapes(ACLMemoryShapes& 
aclMemoryShapes) override; + + arm_compute::Status validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) override; + + ACLFunction configureFunction(const ACLMemoryTensors & aclMemoryTensors) override; + + impl_desc_type implType() const override { + return impl_desc_type::gemm_acl; + } + +protected: + ACLInfo initTensorInfo(const arm_compute::TensorShape& tensorShape, + const arm_compute::DataType& dataType, + const arm_compute::DataLayout& dataLayout) override; + +private: + arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo; + arm_compute::GEMMInfo gemmInfo; + arm_compute::WeightsInfo weightsInfo; +}; + +using ACLGEMMExecutorPtr = std::shared_ptr; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp index ad6479597c6971..0fffcdb2ca2f8d 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp @@ -17,7 +17,8 @@ struct FCAttrs { // @todo probably we don't want with bias flag, since this information is already // a part of src memory descs bool withBias = false; - bool weightsNonTransposed = false; + // TODO: why default is false??? 
+ bool weightsNonTransposed = true; bool sparseWeights = false; // @todo only memory descriptors should be a part of attributes // actual memory should be passed into "execute" or "prepareMemory" calls diff --git a/src/plugins/intel_cpu/src/nodes/executors/gemm_attrs.hpp b/src/plugins/intel_cpu/src/nodes/executors/gemm_attrs.hpp new file mode 100644 index 00000000000000..e76ea336e6df58 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/gemm_attrs.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "cpu_memory.h" +#include "executor_config.hpp" + +namespace ov { +namespace intel_cpu { + +// @todo require explicit initialization of all the attributes? +struct GEMMAttrs { + // @todo probably we don't want with bias flag, since this information is already + // a part of src memory descs + bool withBias = false; + // TODO: why default is false??? + bool weightsNonTransposed = true; + bool sparseWeights = false; + // @todo only memory descriptors should be a part of attributes + // actual memory should be passed into "execute" or "prepareMemory" calls + std::vector dequantizationScales; + // @todo should be passed as an additional memory input? 
+ MemoryCPtr decompressionSubtractPtr; + MemoryCPtr decompressionMultiplyPtr; + uint64_t dynamicQuantizationGroupSize = 0; + ov::intel_cpu::Config::ModelType modelType = ov::intel_cpu::Config::ModelType::Unknown; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/gemm_config.hpp b/src/plugins/intel_cpu/src/nodes/executors/gemm_config.hpp new file mode 100644 index 00000000000000..b7180596b1a5aa --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/gemm_config.hpp @@ -0,0 +1,14 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "executor_config.hpp" +#include "gemm_attrs.hpp" + +namespace ov { +namespace intel_cpu { +using GEMMConfig = ov::intel_cpu::executor::Config; +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/gemm_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/gemm_implementations.cpp new file mode 100644 index 00000000000000..181e4e45a7bc6f --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/gemm_implementations.cpp @@ -0,0 +1,179 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include "cpu/x64/cpu_isa_traits.hpp" +#include "debug_messages.hpp" +#include "implementation_utils.hpp" +#include "memory_desc/cpu_memory_desc.h" +#include "nodes/executors/gemm_config.hpp" +#include "nodes/executors/dnnl/dnnl_convolution_primitive.hpp" +#include "nodes/executors/dnnl/dnnl_fullyconnected.hpp" +#include "nodes/executors/dnnl/dnnl_shape_agnostic_data.hpp" +#include "nodes/executors/executor.hpp" +#include "nodes/executors/executor_implementation.hpp" +#include "nodes/executors/implementations.hpp" +#include "nodes/executors/memory_arguments.hpp" +#include "nodes/executors/precision_matcher.hpp" +#include "nodes/executors/precision_translation.hpp" +#include "nodes/executors/type_mask.hpp" +#include
"openvino/core/type/element_type.hpp" +#include "ov_optional.hpp" +#include "utils/cpp/maybe_unused.hpp" + +#if defined(OV_CPU_WITH_ACL) +#include "nodes/executors/acl/acl_fullyconnected.hpp" +#include "nodes/executors/acl/acl_gemm.hpp" +#endif + +namespace ov { +namespace intel_cpu { + +using namespace ov::element; +using namespace TypeMaskAlias; +using namespace executor; + +static const MappingNotation dnnlFCMappingNotation{ARG_SRC, ARG_WEI, ARG_BIAS, ARG_DST}; + +using LayoutConfig = std::vector; +static const LayoutConfig dnnlFCLayoutConfig{LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp}; +static const LayoutConfig aclFCLayoutConfig{LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp}; +static const LayoutConfig aclMatMulLayoutConfig{LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp}; + +template +struct Require { + bool operator()() { + return dnnl::impl::cpu::x64::mayiuse(ISA); + } +}; + +static const TypeMapping aclMatMulTypeMapping { + // {src, wei, bia, dst} pt + {{_i8, _i8, _any, _any}, pt(just(), just(), just(), just())}, + {{_u8, _u8, _any, _any}, pt(just(), just(), just(), just())}, + {{_any, _any, _any, _any}, pt(just(), just(), just(), just())} +}; + +static const MappingNotation aclMatMulMappingNotation { + ARG_SRC, ARG_WEI, ARG_BIAS, ARG_DST +}; + +// clang-format on + +static bool fullyMatchConfiguration(const MemoryDescArgs& currentDescriptors, + const InOutTypes& typeConfig, + const LayoutConfig& layoutConfig, + const MappingNotation& notation) { + for (size_t i = 0; i < typeConfig.size(); i++) { + const auto& type = typeConfig[i]; + const auto& desc = currentDescriptors.at(notation[i]); + + if (desc->empty()) + continue; + + if (desc->getPrecision() != type) + return false; // type mismatch + + if (!desc->hasLayoutType(layoutConfig[i])) + return false; // layout mismatch + } + + return true; +} + +static MemoryDescArgs createOptimalDescriptors(const MemoryDescArgs& 
currentDescriptors, + const InOutTypes& typeConfig, + const LayoutConfig& layoutConfig, + const MappingNotation& notation) { + MemoryDescArgs descs = currentDescriptors; + + const auto& creatorsMap = BlockedDescCreator::getCommonCreators(); + for (size_t i = 0; i < typeConfig.size(); i++) { + const auto& desc = currentDescriptors.at(notation[i]); + const auto& descType = desc->getPrecision(); + const auto& type = typeConfig[i]; + const auto& layout = layoutConfig[i]; + + if (desc->empty()) + continue; + + if (descType == type && desc->hasLayoutType(layout)) { + continue; + } + + descs[notation[i]] = creatorsMap.at(layout)->createSharedDesc(type, desc->getShape()); + } + + return descs; +} + +template +ov::optional> requiresFallbackCommon(const executor::Config& config, + const TypeMapping& typeMapping, + const LayoutConfig& layoutConfig, + const MappingNotation& notation) { + const auto typeConfig = getTypeConfiguration(config.descs, typeMapping, notation); + + if (fullyMatchConfiguration(config.descs, typeConfig, layoutConfig, notation)) { + return {}; + } + + const auto optimalDescriptors = createOptimalDescriptors(config.descs, typeConfig, layoutConfig, notation); + + return ov::optional>(GEMMConfig{optimalDescriptors, config.attrs, config.postOps}); +} + +OV_CPU_MAYBE_UNUSED_FUNCTION static inline bool noWeightsDecompression(const GEMMConfig& config) { + return !DnnlFCPrimitive::useWeightsDecompressionImpl(srcType(config), weiType(config), config.attrs.modelType); +} + +OV_CPU_MAYBE_UNUSED_FUNCTION static inline bool noSparseDecompression(const GEMMConfig& config) { + return !(config.attrs.sparseWeights); +} + +OV_CPU_MAYBE_UNUSED_FUNCTION static inline bool noPostOps(const GEMMConfig& config) { + return config.postOps.empty(); +} + +template <> +const std::vector>& getImplementations() { + static const std::vector> gemmImplementations { + OV_CPU_INSTANCE_ACL( + "matmul_acl", + ExecutorType::Acl, + OperationType::MatMul, + ShapeTolerance::Agnostic, + // supports
+ [](const GEMMConfig& config) -> bool { + VERIFY(noSparseDecompression(config), UNSUPPORTED_SPARSE_WEIGHTS); + VERIFY(noWeightsDecompression(config), UNSUPPORTED_WEIGHTS_DECOMPRESSION); + return ACLGEMMExecutor::supports(config); + }, + // requiresFallback + [](const GEMMConfig& config) -> ov::optional> { + return requiresFallbackCommon(config, + aclMatMulTypeMapping, + aclMatMulLayoutConfig, + aclMatMulMappingNotation); + }, + // acceptsShapes + [](const MemoryArgs& memory) -> bool { + // @todo create syntactic sugar (functor) for shape agnostic lambda + return true; + }, + // create + [](const GEMMAttrs& attrs, + const PostOps& postOps, + const MemoryArgs& memory, + const ExecutorContext::CPtr context) { + return std::make_shared(attrs, postOps, memory, context); + }) + }; + + return gemmImplementations; +} +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/implementations.hpp b/src/plugins/intel_cpu/src/nodes/executors/implementations.hpp index 5f0be772ab7b1a..3c02b1375f0f3f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/implementations.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/implementations.hpp @@ -8,6 +8,7 @@ #include "nodes/executors/executor_implementation.hpp" #include "nodes/executors/fullyconnected_config.hpp" +#include "nodes/executors/gemm_config.hpp" namespace ov { namespace intel_cpu { @@ -26,6 +27,9 @@ const std::vector>& getImplementations() { template <> const std::vector>& getImplementations(); +template <> +const std::vector>& getImplementations(); + // ... 
} // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/executors/printers.cpp b/src/plugins/intel_cpu/src/nodes/executors/printers.cpp index ac52b25a069541..f9724d7b68a6dd 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/printers.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/printers.cpp @@ -8,6 +8,7 @@ #include "printers.hpp" #include "post_ops.hpp" #include "fullyconnected_config.hpp" +#include "gemm_config.hpp" namespace ov { namespace intel_cpu { @@ -17,6 +18,11 @@ std::ostream & operator<<(std::ostream & os, const FCAttrs& attrs) { return os; } +std::ostream & operator<<(std::ostream & os, const GEMMAttrs& attrs) { + // @todo print Attrs + return os; +} + std::ostream & operator<<(std::ostream & os, const PostOps& postOps) { // @todo print PostOps return os; diff --git a/src/plugins/intel_cpu/src/nodes/executors/printers.hpp b/src/plugins/intel_cpu/src/nodes/executors/printers.hpp index d37ab633ba8036..c2cfcfce112b28 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/printers.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/printers.hpp @@ -7,6 +7,7 @@ #include #include "executor_config.hpp" +#include "gemm_attrs.hpp" namespace ov { namespace intel_cpu { @@ -18,6 +19,7 @@ template struct Config; struct FCAttrs; std::ostream & operator<<(std::ostream & os, const FCAttrs& attrs); +std::ostream & operator<<(std::ostream & os, const GEMMAttrs& attrs); std::ostream & operator<<(std::ostream & os, const PostOps& postOps); template diff --git a/src/plugins/intel_cpu/src/nodes/matmul.cpp b/src/plugins/intel_cpu/src/nodes/matmul.cpp index 4355af1ea12993..dee9876405eb0d 100644 --- a/src/plugins/intel_cpu/src/nodes/matmul.cpp +++ b/src/plugins/intel_cpu/src/nodes/matmul.cpp @@ -22,6 +22,7 @@ #include "common/primitive_hashing_utils.hpp" #include "cpu/x64/cpu_isa_traits.hpp" #include "shape_inference/custom/matmul.hpp" + using namespace dnnl; @@ -244,6 +245,15 @@ static VectorDims getStridesAndModifyShape(Shape& shape, const bool 
transpose) { return strides; } +#if defined(OPENVINO_ARCH_ARM64) and !defined(OPENVINO_MAT_MUL_REFERENCE) +ExecutorPtr MatMul::createExecutor() { + const auto& executor = factory->make(memory); + getSelectedPrimitiveDescriptor()->setImplementationType(executor->implType()); + + return executor; +} +#endif + dnnl::memory::desc MatMul::getBiasDescFrom(const DnnlMemoryDescCPtr outMemDesc) { // oneDNN matmul requires shape for bias desc to be the same rank VectorDims biasDims(outMemDesc->getShape().getRank(), 1); @@ -463,6 +473,59 @@ void MatMul::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; +#if defined(OPENVINO_ARCH_ARM64) and !defined(OPENVINO_MAT_MUL_REFERENCE) + attrs.withBias = getOriginalInputsNumber() == 3; + attrs.dequantizationScales = getDQScales(); + // TODO: not supported for ARM + //attrs.sparseWeights = useSparseWeightsDecompression(getParentEdgeAt(WEIGHTS_ID)->getParent(), + // getOriginalInputPrecisionAtPort(DATA_ID), + // context->getConfig().fcSparseWeiDecompressionRate); + attrs.sparseWeights = false; + attrs.dynamicQuantizationGroupSize = context->getConfig().fcDynamicQuantizationGroupSize; + attrs.modelType = context->getConfig().modelType; + + postOps = getPostOps(fusedWith); + + const auto& srcTypes = getOriginalInputPrecisions(); + auto dstTypes = getOriginalOutputPrecisions(); + // @todo graph optimizer should update original output precisions instead + if (!fusedWith.empty()) + dstTypes = fusedWith.back()->getOriginalOutputPrecisions(); + + VecMemoryDescs srcDescs; + const auto& creatorsMap = BlockedDescCreator::getCommonCreators(); + for (size_t i = 0; i < srcTypes.size(); i++) { + const auto srcDesc = creatorsMap.at(LayoutType::ncsp)->createSharedDesc(srcTypes[i], getInputShapeAtPort(i)); + srcDescs.push_back(srcDesc); + } + + VecMemoryDescs dstDescs; + for (size_t i = 0; i < dstTypes.size(); i++) { + const auto dstDesc = creatorsMap.at(LayoutType::ncsp)->createSharedDesc(dstTypes[i], 
getOutputShapeAtPort(i)); + dstDescs.push_back(dstDesc); + } + + MemoryDescArgs memoryDescs { + {ARG_SRC, srcDescs[0]}, + {ARG_WEI, srcDescs[1]}, + {ARG_BIAS, attrs.withBias ? srcDescs[2] : MemoryDescUtils::makeEmptyDesc()}, + {ARG_DST, dstDescs[0]}, + }; + + auto executionContext = std::make_shared(context, getImplPriority(), privateWeightCache); + factory = std::make_shared>(attrs, postOps, executionContext, memoryDescs); + const auto nodeDescriptors = factory->getProperMemoryDescriptors(memoryDescs); + + NodeConfig nodeConfig; + nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_SRC)); + nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_WEI)); + if (attrs.withBias) nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_BIAS)); + + const int inPlace = canBeInPlace() ? 0 : -1; + nodeConfig.outConfs.emplace_back(nodeDescriptors.at(ARG_DST), BlockedMemoryDesc::FULL_MASK, inPlace); + + supportedPrimitiveDescriptors.emplace_back(nodeConfig, impl_desc_type::undef); +#else auto addSupportedPrimitiveDescriptor = [&](const dnnl::primitive_desc& prim_desc) { std::vector inConfs, outConfs; const int inPlaceOutPort = canBeInPlace() ? 0 : -1; @@ -502,8 +565,27 @@ void MatMul::initSupportedPrimitiveDescriptors() { if (supportedPrimitiveDescriptors.empty()) addSupportedPrimitiveDescriptor(first_desc); } +#endif } +#if defined(OPENVINO_ARCH_ARM64) and !defined(OPENVINO_MAT_MUL_REFERENCE) +void MatMul::createPrimitive() { + memory[ARG_SRC] = getSrcMemoryAtPort(DATA_ID); + memory[ARG_WEI] = getSrcMemoryAtPort(WEIGHTS_ID); + // TODO: we don't need allocate empty memory + memory[ARG_BIAS] = attrs.withBias ? getSrcMemoryAtPort(BIAS_ID) : MemoryDescUtils::makeEmptyMemory(context); +// if (attrs.withBias) { +// memory[ARG_BIAS] = getSrcMemoryAtPort(BIAS_ID); +// } + memory[ARG_DST] = getDstMemoryAtPort(0); + // @todo should we preconfigure only for dynamic shapes? 
+ // Since for static shapes primitive is created in scope of compile_model() anyway + factory->preconfigure(memory); + + Node::createPrimitive(); +} +#endif + MemoryDescPtr MatMul::getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { auto desc = idx > 0 ? prim_desc.weights_desc(idx - 1): prim_desc.src_desc(idx); @@ -524,6 +606,9 @@ ov::element::Type MatMul::getRuntimePrecision() const { } void MatMul::prepareParams() { +#if defined(OPENVINO_ARCH_ARM64) and !defined(OPENVINO_MAT_MUL_REFERENCE) + executor = createExecutor(); +#else auto dstMemPtr = getDstMemoryAtPort(0); auto src0MemPtr = getSrcMemoryAtPort(0); auto src1MemPtr = getSrcMemoryAtPort(1); @@ -630,14 +715,19 @@ void MatMul::prepareParams() { auto pd = execPtr->getPrimitiveDesc(); DEBUG_LOG("verbose##", getName(), "##", DnnlExtensionUtils::query_pd_info(pd), "\n"); #endif +#endif } void MatMul::execute(dnnl::stream strm) { +#if defined(OPENVINO_ARCH_ARM64) and !defined(OPENVINO_MAT_MUL_REFERENCE) + executor->execute(memory); +#else if (execPtr) { execPtr->exec(primArgs, strm); } else { OPENVINO_THROW(errorPrefix, " doesn't have an initialized executor"); } +#endif } void MatMul::executeDynamicImpl(dnnl::stream strm) { diff --git a/src/plugins/intel_cpu/src/nodes/matmul.h b/src/plugins/intel_cpu/src/nodes/matmul.h index 7b8f064e17260b..383347d22e033c 100644 --- a/src/plugins/intel_cpu/src/nodes/matmul.h +++ b/src/plugins/intel_cpu/src/nodes/matmul.h @@ -4,11 +4,21 @@ #pragma once +#include + #include "common/dnnl_executor.h" #include "memory_desc/dnnl_blocked_memory_desc.h" #include "node.h" -#include +// TODO: debug only +//#define OPENVINO_MAT_MUL_REFERENCE + +#if defined(OPENVINO_ARCH_ARM64) and !defined(OPENVINO_MAT_MUL_REFERENCE) +#include "nodes/executors/executor_factory.hpp" +#include "nodes/executors/memory_arguments.hpp" +#include "nodes/executors/fullyconnected_config.hpp" +#include "post_ops.hpp" +#endif namespace ov { namespace intel_cpu { @@ -22,6 +32,11 @@ class MatMul : 
public Node { void createDescriptor(const std::vector& inputDesc, const std::vector& outputDesc) override; void initSupportedPrimitiveDescriptors() override; + +#if defined(OPENVINO_ARCH_ARM64) and !defined(OPENVINO_MAT_MUL_REFERENCE) + void createPrimitive() override; +#endif + MemoryDescPtr getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const override; bool canFuse(const NodePtr& node) const override; bool created() const override; @@ -65,6 +80,20 @@ class MatMul : public Node { std::array inDataDesc; DnnlBlockedMemoryDescPtr outDataDesc; + +#if defined(OPENVINO_ARCH_ARM64) and !defined(OPENVINO_MAT_MUL_REFERENCE) + static const size_t DATA_ID = 0; + static const size_t WEIGHTS_ID = 1; + static const size_t BIAS_ID = 2; + + ExecutorPtr createExecutor(); + + GEMMAttrs attrs; + PostOps postOps; + MemoryArgs memory; + ExecutorFactoryPtr factory; + ExecutorPtr executor; +#endif }; } // namespace node diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index 06e09029eadfe8..3cb2aead53ce7b 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -295,6 +295,11 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< } transformations.PostLpt(); + + ov::pass::Serialize( + "/Users/eshoguli/projects/openvino_matmul/cpu.post_lpt.xml", + "/Users/eshoguli/projects/openvino_matmul/cpu.post_lpt.bin").run_on_model(cloned_model); + transformations.Snippets(); transformations.CpuSpecificOpSet(); @@ -330,6 +335,11 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< denormals_as_zero(false); } } + + ov::pass::Serialize( + "/Users/eshoguli/projects/openvino_matmul/cpu.transformed.xml", + "/Users/eshoguli/projects/openvino_matmul/cpu.transformed.bin").run_on_model(cloned_model); + return std::make_shared(cloned_model, shared_from_this(), conf, false); } diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mat_mul_decomposition.cpp 
b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mat_mul_decomposition.cpp
new file mode 100644
index 00000000000000..f4ddb20ea9320f
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mat_mul_decomposition.cpp
@@ -0,0 +1,47 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+
+#include "mat_mul_decomposition.hpp"
+
+#include "ov_ops/type_relaxed.hpp"
+
+#include "openvino/opsets/opset1.hpp"
+#include "openvino/core/rt_info.hpp"
+
+// Registers a matcher on MatMul. When both inputs are 8-bit integers (i8 or u8) the callback
+// replaces the node with a clone whose output 0 is typed i32, followed by a Convert to f32.
+// Any other MatMul is left untouched (the callback returns false).
+ov::intel_cpu::MatMulDecomposition::MatMulDecomposition() {
+    auto matMul = ov::pass::pattern::wrap_type<ov::opset1::MatMul>();
+
+    ov::matcher_pass_callback callback = [](ov::pass::pattern::Matcher& m) {
+        auto matMul = std::dynamic_pointer_cast<ov::opset1::MatMul>(m.get_match_root());
+        if (!matMul) {
+            return false;
+        }
+
+        // TODO: is it possible to move to matcher?
+        // Both inputs must be int8: a single non-int8 input is enough to reject the node,
+        // otherwise a float or mixed-precision MatMul would get the i32 output type below.
+        const auto in_type1 = matMul->get_input_element_type(0);
+        const auto in_type2 = matMul->get_input_element_type(1);
+        if (((in_type1 != element::i8) && (in_type1 != element::u8)) ||
+            ((in_type2 != element::i8) && (in_type2 != element::u8))) {
+            return false;
+        }
+
+        // NOTE(review): the ov_ops/type_relaxed.hpp include is left over from a draft that built the
+        // MatMul with explicitly overridden precisions; consider that route when fixing the TODO below.
+        const auto newMatMul = matMul->clone_with_new_inputs({matMul->get_input_source_output(0), matMul->get_input_source_output(1)});
+        // TODO: output type is hardcoded
+        newMatMul->set_output_type(0, element::i32, matMul->get_output_partial_shape(0));
+        const auto convert = std::make_shared<ov::opset1::Convert>(newMatMul, element::f32);
+        replace_node(matMul, convert);
+        ov::copy_runtime_info(matMul, {newMatMul, convert});
+        return true;
+    };
+
+    auto m = std::make_shared<ov::pass::pattern::Matcher>(matMul, "MatMulDecomposition");
+    register_matcher(m, callback);
+}
diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mat_mul_decomposition.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mat_mul_decomposition.hpp
new file mode 100644
index 00000000000000..447a3a0a85de49
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mat_mul_decomposition.hpp
@@ -0,0 +1,23 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "openvino/pass/pattern/op/wrap_type.hpp"
+#include "openvino/pass/graph_rewrite.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+/**
+ * @brief Matcher pass that rewrites a MatMul whose two inputs are 8-bit integers (i8/u8)
+ *        into MatMul (output typed i32) followed by Convert to f32.
+ */
+class MatMulDecomposition: public ov::pass::MatcherPass {
+public:
+    OPENVINO_RTTI("MatMulDecomposition", "0");
+    MatMulDecomposition();
+};
+
+} // namespace intel_cpu
+} // namespace ov
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp b/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp
index 99b7421dbcfd01..026d8560991bf7 100644
--- a/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp
+++ b/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp
@@ -72,48 +72,6 @@ bool isFullyConnected(const std::shared_ptr& node) {
            ov::op::util::is_on_constant_path(out_weights);
 }
 
-// TODO: move to base type
-bool canBePerformedAsScaleShift(const std::shared_ptr &node, const int channelAxis) {
-    size_t fusingPort = 0;
-    size_t numNonConstInputs = 0;
-    ov::PartialShape dataShape;
-    for (size_t i = 0; i < node->get_input_size(); i++) {
-        const auto parent = node->get_input_node_shared_ptr(i);
-        if (!ov::is_type(parent)) {
-            fusingPort = i;
-            dataShape = node->get_input_partial_shape(i);
-
// only one non-const parent is allowed - if (++numNonConstInputs != 1) - return false; - } else { - // every const parent must have exactly one child - const auto out = parent->outputs(); - const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1); - if (!has_only_child) - return false; - } - } - - const auto isBroadcastableToDataInput = [&]() { - for (size_t i = 0; i < node->get_input_size(); i++) { - if (i == fusingPort) - continue; - const ov::PartialShape weightShape = node->get_input_partial_shape(i); - if (!isPerTensorOrPerChannelBroadcastable(dataShape.get_max_shape(), weightShape.get_max_shape(), channelAxis, true)) - return false; - } - return true; - }; - - // Prelu and MulAdd are still ignored - // isConvertablePowerStatic() is ignored - return (ov::is_type(node) || - ov::is_type(node) || - ov::is_type(node) || - ov::is_type(node)) && - isBroadcastableToDataInput(); -} - bool SupportsFusingWithConvolution_Simple(const std::shared_ptr &node, const int channelAxis = DEFAULT_AXIS) { // Note: some other operations support this fusing (SoftPlus, Sqrt). // Skip them here, when they are supported by Snippets ARM. Ticket: 141170. 
@@ -122,8 +80,7 @@ bool SupportsFusingWithConvolution_Simple(const std::shared_ptr &nod ov::is_type(node) || ov::is_type(node) || ov::is_type(node) || - ov::is_type(node) || - canBePerformedAsScaleShift(node, channelAxis); + ov::is_type(node); } // Convolution is a special case, since it supports peculiar fusings bool isSuitableConvolutionParent(const std::shared_ptr &node) { diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 63a946cfa69955..7406cba96c8c55 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -113,6 +113,7 @@ #include "transformations/cpu_opset/convert_to_cpu_specific_opset.hpp" #if defined(OPENVINO_ARCH_ARM64) #include "transformations/snippets/aarch64/pass/snippets_mark_skipped.hpp" +#include "transformations/cpu_opset/arm/pass/mat_mul_decomposition.hpp" #else #include "transformations/snippets/x64/pass/snippets_mark_skipped.hpp" #endif @@ -283,10 +284,23 @@ void Transformations::UpToLpt() { const auto defaultPrecisions = useLpt ? 
precision_set::get_int8_support() : std::vector{}; + ov::pass::Serialize( + "/Users/eshoguli/projects/openvino_matmul/cpu.original.xml", + "/Users/eshoguli/projects/openvino_matmul/cpu.original.bin").run_on_model(model); + PreLpt(defaultPrecisions); - if (useLpt) + ov::pass::Serialize( + "/Users/eshoguli/projects/openvino_matmul/cpu.pre_lpt.xml", + "/Users/eshoguli/projects/openvino_matmul/cpu.pre_lpt.bin").run_on_model(model); + + if (useLpt) { Lpt(defaultPrecisions); + + ov::pass::Serialize( + "/Users/eshoguli/projects/openvino_matmul/cpu.lpt.xml", + "/Users/eshoguli/projects/openvino_matmul/cpu.lpt.bin").run_on_model(model); + } } void Transformations::SetSubStreamNum(int SubStreams) { @@ -698,7 +712,11 @@ void Transformations::Lpt(const std::vector& defaultPrecision }), PrecisionsRestriction::create({ {{0}, {ov::element::u8, ov::element::i8}}, +#if defined(OPENVINO_ARCH_ARM64) + {{1}, {ov::element::u8, ov::element::i8}} +#else {{1}, {ov::element::i8}} +#endif }), PrecisionsRestriction::create({ {{0, 1}, {ov::element::u8}} @@ -834,6 +852,8 @@ void Transformations::PostLpt() { auto symbolic_pipeline = CPU_REGISTER_PASS_COMMON(postLPTPassManager, ov::pass::SymbolicOptimizations, false); symbolic_pipeline->get_manager()->register_pass(); + CPU_REGISTER_PASS_ARM64(postLPTPassManager, ov::intel_cpu::MatMulDecomposition); + postLPTPassManager.run_passes(model); } diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/fully_connected_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/fully_connected_transformation.cpp deleted file mode 100644 index 6111ec85f92aeb..00000000000000 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/fully_connected_transformation.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - 
-#include - -#include "low_precision_transformations/fully_connected_transformation.hpp" -#include "common_test_utils/test_constants.hpp" - -using namespace LayerTestsDefinitions; - -namespace { -const std::vector netPrecisions = { - ov::element::f32 -}; - -const std::vector shapes = { - { - ov::PartialShape{ 1, 16 }, - ov::PartialShape{ 16, 8 }, - false, - false - }, - { - ov::PartialShape{ 1, 16 }, - ov::PartialShape{ 8, 16 }, - false, - true - }, - { - ov::PartialShape{ 16, 1 }, - ov::PartialShape{ 16, 8 }, - true, - false - }, -}; - -const std::vector trasformationParamValues = { - LayerTestsUtils::LayerTransformationParamsNGraphFactory::createParams() -}; - -INSTANTIATE_TEST_SUITE_P(smoke_LPT_ref, FullyConnectedTransformation, - ::testing::Combine( - ::testing::ValuesIn(netPrecisions), - ::testing::ValuesIn(shapes), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::ValuesIn(trasformationParamValues), - ::testing::ValuesIn({ov::element::i8}), - ::testing::Values(false), - ::testing::Values("gemm_ref_i8")), - FullyConnectedTransformation::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P(smoke_LPT_acl, FullyConnectedTransformation, - ::testing::Combine( - ::testing::ValuesIn(netPrecisions), - ::testing::ValuesIn(shapes), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::ValuesIn(trasformationParamValues), - ::testing::ValuesIn({ov::element::i8}), - ::testing::Values(true), - ::testing::Values("gemm_acl_i8")), - FullyConnectedTransformation::getTestCaseName); -} // namespace diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/mat_mul_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/mat_mul_transformation.cpp new file mode 100644 index 00000000000000..8aefa735f5da32 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/mat_mul_transformation.cpp 
@@ -0,0 +1,119 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "low_precision_transformations/mat_mul_transformation.hpp" + +using namespace LayerTestsDefinitions; + +namespace { +const std::vector precisions = { + ov::element::f32 +}; + +std::vector testValues = { + // supported on arm only + { + { 1, 1, 12, 2 }, + { 256ul, ov::Shape({}), {0.f}, {25.5f}, {0.f}, {25.5f} }, + { 1, 1, 2, 12 }, + { 256ul, ov::Shape({}), {0.f}, {25.5f}, {0.f}, {25.5f} }, + "matMul", + "u8", + false + }, + // supported on arm only: LPT was updated + { + { 1, 1, 12, 2 }, + { 256ul, ov::Shape({}), {0.f}, {25.5f}, {0.f}, {25.5f} }, + { 1, 1, 2, 12 }, + { 256ul, ov::Shape({}), {0.f}, {25.5f}, {0.f}, {25.5f} }, + "matMul", + "u8", + true + }, +// // supported on arm only: should update LPT and uncomment +// { +// { 1, 1, 12, 2 }, +// { 256ul, ov::Shape({}), {-12.8f}, {12.7f}, {-12.8f}, {12.7f} }, +// { 1, 1, 2, 12 }, +// { 256ul, ov::Shape({}), {0.f}, {25.5f}, {0.f}, {25.5f} }, +// "matMul", +// "u8", +// false +// }, + +// { +// { 1, 4, 12, 2 }, +// { 256ul, ov::Shape({}), {0.f}, {25.5f}, {0.f}, {25.5f} }, +// { 1, 4, 2, 12 }, +// { 256ul, ov::Shape({}), {-12.8f}, {12.7f}, {-12.8f}, {12.7f} }, +// "matMul_original", +// "u8" +// }, +// { +// { 8, 4, 12, 2 }, +// { 256ul, ov::Shape({}), {0.f}, {25.5f}, {0.f}, {25.5f} }, +// { 8, 4, 2, 12 }, +// { 256ul, ov::Shape({}), {-12.8f}, {12.7f}, {-12.8f}, {12.7f} }, +// "matMul_original", +// "u8" +// }, + + { + { 1, 1, 12, 8 }, + { 256ul, ov::Shape({}), {-12.8f}, {12.7f}, {-12.8f}, {12.7f} }, + { 1, 1, 8, 6 }, + { 256ul, ov::Shape({}), {-12.8f}, {12.7f}, {-12.8f}, {12.7f} }, + "matMul", + "i8", + false + }, + { + { 1, 1, 12, 2 }, + { 256ul, ov::Shape({}), {-12.8f}, {12.7f}, {-12.8f}, {12.7f} }, + { 1, 1, 2, 12 }, + { 256ul, ov::Shape({}), {-12.8f}, {12.7f}, {-12.8f}, {12.7f} }, + "matMul", + "i8", + true + }, + +// { +// { 1, 1, 1, 4, 12, 2 }, +// { 256ul, ov::Shape({}), 
{-12.8f}, {12.7f}, {-12.8f}, {12.7f} }, +// { 1, 1, 1, 4, 2, 12 }, +// { 256ul, ov::Shape({}), {-12.8f}, {12.7f}, {-12.8f}, {12.7f} }, +// "matMul_original", +// "i8" +// }, + { + { 12 }, + { 256ul, ov::Shape({}), {-12.8f}, {12.7f}, {-12.8f}, {12.7f} }, + { 12 }, + { 256ul, ov::Shape({}), {-12.8f}, {12.7f}, {-12.8f}, {12.7f} }, + "matMul_original/MM", + "i8", + false, + }, + { + { 12 }, + { 256ul, ov::Shape({}), {0.f}, {25.5f}, {0.f}, {25.5f} }, + { 12 }, + { 256ul, ov::Shape({}), {0.f}, {25.5f}, {0.f}, {25.5f} }, + "matMul_original/MM", + "u8", + false, + }, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_LPT, MatMulTransformation, + ::testing::Combine( + ::testing::ValuesIn(precisions), + ::testing::Values(ov::PartialShape({ 1, 384, 1024 })), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::ValuesIn(testValues)), + MatMulTransformation::getTestCaseName); +} // namespace diff --git a/src/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_transformation.hpp b/src/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_transformation.hpp index 3d394c1e45674c..74dd7450130360 100644 --- a/src/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_transformation.hpp +++ b/src/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_transformation.hpp @@ -19,8 +19,10 @@ class MatMulTransformationTestValues { ov::builder::subgraph::FakeQuantizeOnData fqOnData1; ov::Shape inputShape2; ov::builder::subgraph::FakeQuantizeOnData fqOnData2; + // TODO: remove, not used std::string expectedKernelName; std::string expectedRuntimePrecision; + bool requantization; }; typedef std::tuple< diff --git a/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp b/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp index bc1ce628deb245..a22752bc7b857e 100644 --- 
a/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp +++ b/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp @@ -27,10 +27,11 @@ std::string MatMulTransformation::getTestCaseName(const testing::TestParamInfo(GetParam()); - const auto actualType = get_runtime_precision(params.expectedKernelName); + const auto& actualType = get_runtime_precision_by_type("MatMul"); + const auto expected = std::get<3>(GetParam()); + EXPECT_EQ(expected.expectedRuntimePrecision, actualType); - EXPECT_EQ(actualType, params.expectedRuntimePrecision); + const auto& actualPrimitiveType = get_property_by_type("MatMul", "primitiveType"); + const auto expectedPrimitiveType = "gemm_acl_i8"; + EXPECT_EQ(expectedPrimitiveType, actualPrimitiveType); } TEST_P(MatMulTransformation, CompareWithRefImpl) { diff --git a/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp b/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp index 693ec58248f62d..120d15ea1ba342 100644 --- a/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp +++ b/src/tests/ov_helpers/ov_lpt_models/include/ov_lpt_models/mat_mul.hpp @@ -36,7 +36,8 @@ class MatMulFunction { const ov::Shape& inputShape1, const FakeQuantizeOnData& fqOnData1, const ov::Shape& inputShape2, - const FakeQuantizeOnData& fqOnData2); + const FakeQuantizeOnData& fqOnData2, + const bool requantization = false); static std::shared_ptr getOriginal(const ov::element::Type netPrecision, const ov::PartialShape& inputShape1, diff --git a/src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp b/src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp index c6499328154315..b0e2db62c19172 100644 --- a/src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp +++ b/src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp @@ -106,21 +106,30 @@ std::shared_ptr MatMulFunction::getOriginal( const ov::Shape& inputShape1, const FakeQuantizeOnData& fqOnData1, const 
ov::Shape& inputShape2, - const FakeQuantizeOnData& fqOnData2) { + const FakeQuantizeOnData& fqOnData2, + const bool requantization) { const std::shared_ptr input1 = std::make_shared(precision, inputShape1); input1->set_friendly_name("input1"); const std::shared_ptr input2 = std::make_shared(precision, inputShape2); input2->set_friendly_name("input2"); - const std::shared_ptr matMul = std::make_shared( + std::shared_ptr parent = std::make_shared( makeFakeQuantize(input1, precision, fqOnData1), makeFakeQuantize(input2, precision, fqOnData2), false, false); - matMul->set_friendly_name("matMul"); + parent->set_friendly_name("matMul"); - std::shared_ptr result = std::make_shared(matMul); + if (requantization) { + parent = makeFakeQuantize(parent, precision, fqOnData1); + parent = std::make_shared( + parent, + std::make_shared(ov::element::f32, Shape{1}, std::vector{0.f})); + parent->set_friendly_name("prelu"); + } + + std::shared_ptr result = std::make_shared(parent); std::shared_ptr function = std::make_shared( ov::ResultVector{ result },