From e9400c5f1f451a686a9365027a024a7501670890 Mon Sep 17 00:00:00 2001 From: Gorokhov Dmitriy Date: Thu, 19 Dec 2024 19:20:04 +0400 Subject: [PATCH] [CPU] Enable compressed FC via oneDNN Matmul primitive (#27459) ### Details: - This PR enables execution of FullyConnected operations via the oneDNN MatMul primitive - Matmul_weights_decompression tests are split into x64 and ARM instances; the ARM tests pass using the reference matmul. - The newly added functionality is still gated behind debug caps. To try it out: -- Build OV with the -DENABLE_DEBUG_CAPS=ON cmake option -- export OV_CPU_ENABLE_DNNL_MAMTUL_FOR_FC=1 --- src/plugins/intel_cpu/src/cpu_memory.cpp | 2 +- .../intel_cpu/src/dnnl_postops_composer.cpp | 68 +++++- .../intel_cpu/src/dnnl_postops_composer.h | 15 +- .../src/nodes/common/cpu_convert.cpp | 13 +- .../dnnl/dnnl_fullyconnected_primitive.cpp | 16 +- .../executors/dnnl/dnnl_matmul_primitive.cpp | 90 +++++--- .../executors/dnnl/dnnl_matmul_primitive.hpp | 2 + .../fullyconnected_implementations.cpp | 5 +- .../intel_cpu/src/nodes/fullyconnected.cpp | 40 +++- .../intel_cpu/src/nodes/fullyconnected.h | 2 + .../convert_to_cpu_specific_opset.hpp | 25 +-- .../aarch64/pass/snippets_mark_skipped.cpp | 16 ++ .../transformation_pipeline.cpp | 21 +- .../src/arm/matmul_weights_decompression.cpp | 86 ++++++++ .../classes/matmul_weights_decompression.cpp | 167 ++++++++++++++ .../classes/matmul_weights_decompression.hpp | 79 +++++++ .../src/x64/matmul_weights_decompression.cpp | 204 +----------------- .../skip_tests_config.cpp | 1 - 18 files changed, 574 insertions(+), 278 deletions(-) create mode 100644 src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/matmul_weights_decompression.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp diff --git a/src/plugins/intel_cpu/src/cpu_memory.cpp b/src/plugins/intel_cpu/src/cpu_memory.cpp index 7cb4abc2161f14..71851c529c6095 100644 --- a/src/plugins/intel_cpu/src/cpu_memory.cpp +++ b/src/plugins/intel_cpu/src/cpu_memory.cpp @@ -45,7 +45,7 @@ void transferData(const IMemory& src, const IMemory& dst, bool ftz) { if (!ftz) { return; } - if (src.getDesc().getPrecision() != ov::element::f32 || dst.getDesc().getPrecision() == ov::element::bf16) { + if (src.getDesc().getPrecision() != ov::element::f32 || dst.getDesc().getPrecision() != ov::element::f32) { return; } size_t offset = 0; diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp b/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp index 9b86a1433acb06..be0c8a2a62d954 100644 --- a/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp +++ b/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp @@ -628,13 +628,71 @@ static MemoryPtr prepackDecompressionParams(const MemoryCPtr& paramsPtr, auto srcMem = std::make_shared(engine, srcMemoryDesc, paramsPtr->getData()); dstMem->load(*srcMem); - return dstMem; } +static dnnl::memory::dims getGroupDims(const VectorDims& weiDims, const VectorDims& scaleDims) { + if (scaleDims[0] == 1 && scaleDims[1] == 1) + return {}; + + int N = weiDims[weiDims.size() - 2]; + int K = weiDims[weiDims.size() - 1]; + dnnl::memory::dim groupN = N / scaleDims[0]; + dnnl::memory::dim groupK = K / scaleDims[1]; + + return {groupK, groupN}; +} + +static int getMask(const VectorDims& weiDims, const dnnl::memory::dims& groupDims) { + const int maskN = 1 << (weiDims.size() - 1); + const int maskK = 1 
<< (weiDims.size() - 2); + int N = weiDims[weiDims.size() - 2]; + int K = weiDims[weiDims.size() - 1]; + int mask = 0; + if (!groupDims.empty() && groupDims[1] != N) + mask += maskN; + if (!groupDims.empty() && groupDims[0] != K) + mask += maskK; + + return mask; +} + void DnnlPostOpsComposer::appendDecompressionScales(const MemoryCPtr& scales_ptr, bool needTranspose, - ov::element::Type dstPrecision) { + ov::element::Type dstPrecision, + const VectorDims& weiDims) { + if (scales_ptr == nullptr) + return; + + auto scaleMem = prepackDecompressionParams(scales_ptr, needTranspose, dstPrecision, engine); + auto groupDims = getGroupDims(weiDims, scaleMem->getStaticDims()); + auto mask = getMask(weiDims, groupDims); + + attr.set_scales(DNNL_ARG_WEIGHTS, mask, groupDims, DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); + cpuArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = std::move(scaleMem); + dnnlArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = + cpuArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS]->getPrimitive(); +} + +void DnnlPostOpsComposer::appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, + bool needTranspose, + ov::element::Type dstPrecision, + const VectorDims& weiDims) { + if (zero_points_ptr == nullptr) + return; + + auto zeroPointsMem = prepackDecompressionParams(zero_points_ptr, needTranspose, dstPrecision, engine); + auto groupDims = getGroupDims(weiDims, zeroPointsMem->getStaticDims()); + auto mask = getMask(weiDims, groupDims); + + attr.set_zero_points(DNNL_ARG_WEIGHTS, mask, groupDims, DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); + cpuArgs[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS] = zeroPointsMem; + dnnlArgs[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS] = zeroPointsMem->getPrimitive(); +} + +void DnnlPostOpsComposer::appendDecompressionScalesLegacy(const MemoryCPtr& scales_ptr, + bool needTranspose, + ov::element::Type dstPrecision) { if (scales_ptr == nullptr) return; @@ -647,9 +705,9 @@ void DnnlPostOpsComposer::appendDecompressionScales(const MemoryCPtr& scales_ptr cpuArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS]->getPrimitive(); } -void DnnlPostOpsComposer::appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, - bool needTranspose, - ov::element::Type dstPrecision) { +void DnnlPostOpsComposer::appendDecompressionZeroPointsLegacy(const MemoryCPtr& zero_points_ptr, + bool needTranspose, + ov::element::Type dstPrecision) { if (zero_points_ptr == nullptr) return; diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer.h b/src/plugins/intel_cpu/src/dnnl_postops_composer.h index 7ae634658b005f..81fd1aaeed194d 100644 --- a/src/plugins/intel_cpu/src/dnnl_postops_composer.h +++ b/src/plugins/intel_cpu/src/dnnl_postops_composer.h @@ -30,10 +30,21 @@ class DnnlPostOpsComposer { const MemoryArgs& memory, const dnnl::memory::data_type outDataType); DnnlPrimitiveAttrs compose(); - void appendDecompressionScales(const MemoryCPtr& scales_ptr, bool needTranspose, ov::element::Type dstPrecision); + + void appendDecompressionScales(const MemoryCPtr& scales_ptr, + bool needTranspose, + ov::element::Type dstPrecision, + const VectorDims& weiDims); void appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, bool needTranspose, - ov::element::Type dstPrecision); + ov::element::Type dstPrecision, + const VectorDims& weiDims); + void appendDecompressionScalesLegacy(const MemoryCPtr& scales_ptr, + bool needTranspose, + ov::element::Type dstPrecision); + void appendDecompressionZeroPointsLegacy(const MemoryCPtr& zero_points_ptr, + bool 
needTranspose, + ov::element::Type dstPrecision); void setDynamicQuantizationParams(uint64_t groupSize); private: diff --git a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp index a0590827006eb4..0c8cddd905dc2e 100644 --- a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp @@ -570,12 +570,13 @@ struct ConvertFromBinPrecision> { } }; -#define INTEL_CPU_CVT_FROM_4BIT_LIST \ - INTEL_CPU_CVT(u4, f32), INTEL_CPU_CVT(u4, bf16), INTEL_CPU_CVT(u4, f16), INTEL_CPU_CVT(u4, i8), \ - INTEL_CPU_CVT(u4, u8), INTEL_CPU_CVT(i4, f32), INTEL_CPU_CVT(i4, bf16), INTEL_CPU_CVT(i4, f16), \ - INTEL_CPU_CVT(i4, i8), INTEL_CPU_CVT(i4, u8), INTEL_CPU_CVT(nf4, f32), INTEL_CPU_CVT(nf4, bf16), \ - INTEL_CPU_CVT(nf4, f16), INTEL_CPU_CVT(nf4, i8), INTEL_CPU_CVT(nf4, u8), INTEL_CPU_CVT(f4e2m1, f32), \ - INTEL_CPU_CVT(f4e2m1, bf16), INTEL_CPU_CVT(f4e2m1, f16), INTEL_CPU_CVT(f4e2m1, i8), INTEL_CPU_CVT(f4e2m1, u8) +#define INTEL_CPU_CVT_FROM_4BIT_LIST \ + INTEL_CPU_CVT(u4, f32), INTEL_CPU_CVT(u4, i32), INTEL_CPU_CVT(u4, bf16), INTEL_CPU_CVT(u4, f16), \ + INTEL_CPU_CVT(u4, i8), INTEL_CPU_CVT(u4, u8), INTEL_CPU_CVT(i4, f32), INTEL_CPU_CVT(i4, i32), \ + INTEL_CPU_CVT(i4, bf16), INTEL_CPU_CVT(i4, f16), INTEL_CPU_CVT(i4, i8), INTEL_CPU_CVT(i4, u8), \ + INTEL_CPU_CVT(nf4, f32), INTEL_CPU_CVT(nf4, bf16), INTEL_CPU_CVT(nf4, f16), INTEL_CPU_CVT(nf4, i8), \ + INTEL_CPU_CVT(nf4, u8), INTEL_CPU_CVT(f4e2m1, f32), INTEL_CPU_CVT(f4e2m1, bf16), INTEL_CPU_CVT(f4e2m1, f16), \ + INTEL_CPU_CVT(f4e2m1, i8), INTEL_CPU_CVT(f4e2m1, u8) struct ConvertFrom4BitContext { ov::element::Type_t inType; diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp index 52434a1eeb8461..8ae2d2784193af 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp @@ -228,14 +228,16 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const FCAttrs& attrs, if (dstPrc != f8e8m0 || useDynamicQuantization) dstPrc = ov::element::f32; - dnnlpoc.appendDecompressionScales(memory.at(ARG_WEI | ARG_ATTR_SCALES), !attrs.weightsNonTransposed, dstPrc); + dnnlpoc.appendDecompressionScalesLegacy(memory.at(ARG_WEI | ARG_ATTR_SCALES), + !attrs.weightsNonTransposed, + dstPrc); } if (memory.count(ARG_WEI | ARG_ATTR_ZERO_POINTS)) { auto dstPrc = useDynamicQuantization ? ov::element::u8 : ov::element::f32; - dnnlpoc.appendDecompressionZeroPoints(memory.at(ARG_WEI | ARG_ATTR_ZERO_POINTS), - !attrs.weightsNonTransposed, - dstPrc); + dnnlpoc.appendDecompressionZeroPointsLegacy(memory.at(ARG_WEI | ARG_ATTR_ZERO_POINTS), + !attrs.weightsNonTransposed, + dstPrc); } if (useDynamicQuantization) { @@ -247,9 +249,9 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const FCAttrs& attrs, uint8_t zp_value = (wei_precision == ov::element::i8) ? 
128 : 8; DnnlBlockedMemoryDesc zpMemoryDesc(ov::element::u8, Shape({1})); auto decompressionSubtractPtr = std::make_shared(context->getEngine(), zpMemoryDesc, &zp_value); - dnnlpoc.appendDecompressionZeroPoints(decompressionSubtractPtr, - !attrs.weightsNonTransposed, - ov::element::u8); + dnnlpoc.appendDecompressionZeroPointsLegacy(decompressionSubtractPtr, + !attrs.weightsNonTransposed, + ov::element::u8); } dnnlpoc.setDynamicQuantizationParams(attrs.dynamicQuantizationGroupSize); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp index 86b22607111833..9ffe4731689d43 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -76,6 +77,23 @@ bool DnnlMatMulPrimitive::Key::operator==(const Key& rhs) const { return result; } +template +static dimsType normalizeToRank(const dimsType& vec, size_t rank) { + if (vec.size() == rank || vec.empty()) + return vec; + + dimsType result; + result.reserve(rank); + + for (size_t i = vec.size(); i < rank; ++i) { + result.push_back(1); + } + + result.insert(result.end(), vec.begin(), vec.end()); + + return result; +} + std::shared_ptr DnnlMatMulPrimitive::create(const MemoryArgs& memory, const MatMulAttrs& attrs, const ExecutorContext::CPtr context, @@ -105,19 +123,22 @@ DnnlMemoryDescPtr DnnlMatMulPrimitive::makeTransposedWeightDescriptor(const Dnnl const auto& weiDesc = srcDesc->getDnnlDesc(); auto wDims = weiDesc.get_dims(); auto wDataType = weiDesc.get_data_type(); + std::swap(wDims[wDims.size() - 1], wDims[wDims.size() - 2]); dnnl::memory::dims wDims2D = reshapeDownToRank<2>(wDims); const auto format = weightsNonTransposed ? 
dnnl::memory::format_tag::ab : dnnl::memory::format_tag::ba; const auto transposedWeiDesc = dnnl::memory::desc{wDims2D, wDataType, format}; + const auto reshapedWeiDesc = transposedWeiDesc.reshape(dstDesc->getDnnlDesc().get_dims()); - return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc); + return DnnlExtensionUtils::makeDescriptor(reshapedWeiDesc); } static DnnlPrimitiveAttrs createPrimitiveAttrs(const MatMulAttrs& attrs, const PostOps& postOps, const MemoryArgs& memory, ExecutorContext::CPtr context, - bool useDynamicQuantization) { + bool useWeightsDecompression, + bool weightsNonTransposed) { const auto& srcDesc = memory.at(ARG_SRC)->getDescPtr(); const auto& weiDesc = memory.at(ARG_WEI)->getDescPtr(); const auto& dstDesc = memory.at(ARG_DST)->getDescPtr(); @@ -132,7 +153,30 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const MatMulAttrs& attrs, DnnlPostOpsComposer dnnlpoc(postOps, context->getEngine(), dims, dims.size() - 1, isINT8, 1 << 0, memory, outputDataType); - return dnnlpoc.compose(); + const auto maxRank = + std::max({srcDesc->getShape().getRank(), weiDesc->getShape().getRank(), dstDesc->getShape().getRank()}); + const auto normWeiDims = normalizeToRank(weiDesc->getShape().getStaticDims(), maxRank); + if (memory.count(ARG_WEI | ARG_ATTR_SCALES)) { + auto dstPrc = ov::element::f32; + dnnlpoc.appendDecompressionScales(memory.at(ARG_WEI | ARG_ATTR_SCALES), + !weightsNonTransposed, + dstPrc, + normWeiDims); + } + if (memory.count(ARG_WEI | ARG_ATTR_ZERO_POINTS)) { + // TODO: clarify oneDNN requirements on ZP precision + auto zp = memory.at(ARG_WEI | ARG_ATTR_ZERO_POINTS); + auto zpPrc = zp->getPrecision(); + auto dstPrc = one_of(zpPrc, i32, i8, u8, i4, u4) ? zpPrc : i32; + dnnlpoc.appendDecompressionZeroPoints(zp, !weightsNonTransposed, dstPrc, normWeiDims); + } + + auto primAttrs = dnnlpoc.compose(); + if (useWeightsDecompression) { + primAttrs.attr.set_fpmath_mode(fpmath_mode::any, true); + } + + return primAttrs; } static dnnl::matmul::primitive_desc createDescriptorInternal(const dnnl::memory::desc& inputDesc, @@ -143,22 +187,6 @@ static dnnl::matmul::primitive_desc createDescriptorInternal(const dnnl::memory: const dnnl::engine& engine, const bool useSparseWeights, const bool useWeightsDecompression) { - auto normalizeToRank = [](const dnnl::memory::dims& vec, size_t rank) -> dnnl::memory::dims { - if (vec.size() == rank || vec.empty()) - return vec; - - dnnl::memory::dims result; - result.reserve(rank); - - for (size_t i = vec.size(); i < rank; ++i) { - result.push_back(1); - } - - result.insert(result.end(), vec.begin(), vec.end()); - - return result; - }; - auto weiDims = weightDesc.get_dims(); std::swap(weiDims[weiDims.size() - 1], weiDims[weiDims.size() - 2]); @@ -175,7 +203,9 @@ static dnnl::matmul::primitive_desc createDescriptorInternal(const dnnl::memory: auto idt = inputDesc.get_data_type(); auto wdt = idt; - if (idt == dnnl::memory::data_type::u8 || idt == dnnl::memory::data_type::s8) { + if (useWeightsDecompression) { + wdt = weightDesc.get_data_type(); + } else if (idt == dnnl::memory::data_type::u8 || idt == dnnl::memory::data_type::s8) { wdt = memory::data_type::s8; } @@ -245,6 +275,16 @@ static VectorDims makeDummyOutputDims(const VectorDims& inShape, const VectorDim return outputShape; } +bool DnnlMatMulPrimitive::useWeightsDecompressionImpl(const ov::element::Type inputType, + const ov::element::Type weightsType) { +#if defined(OPENVINO_ARCH_X86_64) + if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) + return false; +#endif + + 
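+    /* Descriptive note: weights decompression through the oneDNN MatMul primitive is only used for
+       f32/bf16/f16 activations combined with u8/i8/u4/i4 compressed weights (see the return below);
+       on x86-64 it additionally requires at least AVX2 (checked above), otherwise the regular paths are used. */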
return (one_of(inputType, f32, bf16, f16) && one_of(weightsType, u8, i8, u4, i4)); +} + DnnlShapeAgnosticDataPtr DnnlMatMulPrimitive::createShapeAgnosticData(const FCAttrs& attrs, const PostOps& postOps, const MemoryArgs& memory, @@ -257,7 +297,9 @@ DnnlShapeAgnosticDataPtr DnnlMatMulPrimitive::createShapeAgnosticData(const FCAt auto dstDesc = memory.at(ARG_DST)->getDescPtr(); MatMulAttrs mmAttrs{false, false}; - const auto postOpData = createPrimitiveAttrs(mmAttrs, postOps, memory, context, false); + const auto useWeightsDecompression = useWeightsDecompressionImpl(srcDesc->getPrecision(), weiDesc->getPrecision()); + const auto postOpData = + createPrimitiveAttrs(mmAttrs, postOps, memory, context, useWeightsDecompression, attrs.weightsNonTransposed); if (!cacheWeights) return std::make_shared(postOpData); @@ -285,7 +327,7 @@ DnnlShapeAgnosticDataPtr DnnlMatMulPrimitive::createShapeAgnosticData(const FCAt context->getEngine(), context->getImplPriorities(), false, - false); + useWeightsDecompression); const auto weightsDesc = DnnlExtensionUtils::makeDescriptor(primDesc.weights_desc()); auto originalWeightsDesc = MemoryDescUtils::convertToDnnlMemoryDesc(weiDesc); @@ -319,7 +361,7 @@ DnnlMatMulPrimitive::DnnlMatMulPrimitive(const Key& key, engine, implPriorities, false, - false)), + useWeightsDecompressionImpl(key.src->getPrecision(), key.wei->getPrecision()))), m_implType(implTypeFromPrimDesc(m_primDesc)), m_srcDesc(DnnlExtensionUtils::makeDescriptor(m_primDesc.src_desc())), m_weiDesc(DnnlExtensionUtils::makeDescriptor(m_primDesc.weights_desc())), @@ -328,8 +370,6 @@ DnnlMatMulPrimitive::DnnlMatMulPrimitive(const Key& key, m_prim(primitive(m_primDesc)) {} void DnnlMatMulPrimitive::execute(const dnnl_primitive_args& primArgs) const { - std::cout << "Executing MM primitive" - << "\n"; m_prim.execute(m_stream, primArgs); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.hpp index 618d3abdf8b3de..5491b62a154687 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.hpp @@ -53,6 +53,8 @@ class DnnlMatMulPrimitive { return m_implType; } + static bool useWeightsDecompressionImpl(const ov::element::Type inputType, const ov::element::Type weightsType); + static DnnlShapeAgnosticDataPtr createShapeAgnosticData(const FCAttrs& attrs, const PostOps& postOps, const MemoryArgs& memory, diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp index bc55af8cfbb0e2..f2cf5a7c9102b7 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp @@ -133,6 +133,8 @@ static const TypeMapping dnnlMatMulTypeMapping { // quantization configuration {{_u8 | _i8, _i8, _u8|_i8|_i32|_bf16|_f16|_f32|_undefined, _u8|_i8|_i32|_bf16|_f16|_f32}, pt(bypass(), bypass(), bypass(), bypass())}, {{_u8 | _i8, _i8, _any, _any}, pt(bypass(), bypass(), just(), just())}, + // compresses int weights + {{_f32 | _bf16 | _f16, _u8 | _i8, _any, _any}, pt(bypass(), bypass(), use<0>(), use<0>())}, // @todo should we fallback to FPXX instead of _f32? 
{{_any, _any, _any, _any}, pt(just(), just(), just(), just())}, // @todo explicitly cover configuration limitations for oneDNN on ARM @@ -443,7 +445,7 @@ const std::vector>& getImplementations() { return std::make_shared(attrs, postOps, memory, context); } ) - OV_CPU_INSTANCE_X64( + OV_CPU_INSTANCE_DNNL( "matmul_dnnl", ExecutorType::Dnnl, OperationType::MatMul, @@ -454,7 +456,6 @@ const std::vector>& getImplementations() { CPU_DEBUG_CAP_ENABLE( if (getEnvBool("OV_CPU_ENABLE_DNNL_MAMTUL_FOR_FC")) { VERIFY(noSparseDecompression(config), UNSUPPORTED_SPARSE_WEIGHTS); - VERIFY(noWeightsDecompression(config), UNSUPPORTED_WEIGHTS_DECOMPRESSION); return true; }) return false; diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp index 2df6c0ae7522cc..4a2e3728887087 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp @@ -41,6 +41,42 @@ namespace ov { namespace intel_cpu { namespace node { +ov::element::TypeVector FullyConnected::getSupportedCompressedWeightsTypes() { + using ov::element::Type_t; + + bool useMatmulPrim = false; + CPU_DEBUG_CAP_ENABLE(useMatmulPrim = getEnvBool("OV_CPU_ENABLE_DNNL_MAMTUL_FOR_FC");) + + if (useMatmulPrim) { + return {Type_t::u8, Type_t::i8}; + } else { +#if defined(OPENVINO_ARCH_X86_64) + return {Type_t::u8, Type_t::i8, Type_t::u4, Type_t::i4, Type_t::nf4, Type_t::f4e2m1}; +#else + return {}; +#endif + } +} + +ov::element::TypeVector FullyConnected::getSupportedCompressedActivationsTypes() { + using ov::element::Type_t; + + bool useMatmulPrim = false; + CPU_DEBUG_CAP_ENABLE(useMatmulPrim = getEnvBool("OV_CPU_ENABLE_DNNL_MAMTUL_FOR_FC");) + + if (useMatmulPrim) { + return {Type_t::f32, Type_t::f16}; + } else { +#if defined(OPENVINO_ARCH_X86_64) + // @todo enable for bf16 as well + // after EnforceInferencePrecision is replaced with ConvertPrecision + return {Type_t::f32}; +#else + return {}; +#endif + } +} + bool FullyConnected::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { @@ -113,7 +149,9 @@ bool FullyConnected::isSupportedCompressedOperation(const std::shared_ptr& model, const C CPU_REGISTER_PASS_COMMON(manager, ConvertMatMulToFC); CPU_REGISTER_PASS_COMMON(manager, FullyConnectedBiasFusion); - std::vector supported_activation_types{ - // @todo enable for bf16 as well - // after EnforceInferencePrecision is replaced with ConvertPrecision - ov::element::f32, - }; - - std::vector supported_compressed_weights_types{ - ov::element::u8, - ov::element::i8, - ov::element::u4, - ov::element::i4, - ov::element::nf4, - ov::element::f4e2m1, - }; - - CPU_REGISTER_PASS_X64( + CPU_REGISTER_PASS_COMMON( manager, pass::ConvertFullyConnectedToFullyConnectedCompressed, - supported_activation_types, - supported_compressed_weights_types, + ov::intel_cpu::node::FullyConnected::getSupportedCompressedActivationsTypes(), + ov::intel_cpu::node::FullyConnected::getSupportedCompressedWeightsTypes(), [&config](const std::shared_ptr& fc, size_t IC, size_t OC, size_t G) { return ov::intel_cpu::node::FullyConnected::isSupportedCompressedOperation(fc, IC, @@ -65,8 +50,8 @@ inline void ConvertToCPUSpecificOpset(std::shared_ptr& model, const C }); CPU_REGISTER_PASS_X64(manager, pass::ConvertFCToFCQuantizedLegacy); - CPU_REGISTER_PASS_X64(manager, MoveFCReshapeToWeights); - CPU_REGISTER_PASS_X64(manager, ov::pass::Validate); + CPU_REGISTER_PASS_COMMON(manager, MoveFCReshapeToWeights); + CPU_REGISTER_PASS_COMMON(manager, 
ov::pass::Validate); CPU_REGISTER_PASS_COMMON(manager, AlignMatMulInputRanks); CPU_REGISTER_PASS_COMMON(manager, ConvertTileToSeqTiles); CPU_REGISTER_PASS_COMMON(manager, ConvertToPowerStatic); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp b/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp index 25b10d55ca8165..c567e7c38c2ef1 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp @@ -212,6 +212,13 @@ auto is_skipped_op(const std::shared_ptr& op) -> bool { return ov::is_type(op) || ov::is_type(op) || ov::is_type(op); } + +bool isSuitableMatMulWithConstantPath(const std::shared_ptr& node) { + return ov::is_type(node) && + !ov::is_type(node->get_input_node_shared_ptr(1)) && + ov::op::util::is_on_constant_path(node->input_value(1)); +} + } // namespace bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr& m) { @@ -220,6 +227,15 @@ bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr& m) { for (auto& node : m->get_ordered_ops()) { if (is_skipped_op(node)) continue; + // We perform this check separately because we mark here only weights path + // Matmul itself will be checked further + if (isSuitableMatMulWithConstantPath(node)) { + auto markup_func = [](Node* node) { + SetSnippetsNodeType(node->shared_from_this(), snippets::pass::SnippetsNodeType::SkippedByPlugin); + }; + std::unordered_set visited; + ov::op::util::visit_constant_path(node->get_input_node_ptr(1), visited, markup_func); + } if (isSuitableConvolutionParent(node)) { // Initiate fusing chain SetNodeFusingType(node, NodeFusingType::FusedWithConvolution); diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 469abbd99eb149..13e890f6339e81 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -337,19 +337,14 @@ void Transformations::PreLpt(const std::vector& defaultPrecis CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::MarkShapeOfSubgraphs); // We need to fuse Transpose to MatMul to have a simpler callback for the next transformation CPU_REGISTER_PASS_X64(decompression_handling_manager, ov::pass::TransposeMatMul); - ov::element::TypeVector decompression_precisions{ov::element::u8, - ov::element::i8, - ov::element::u4, - ov::element::i4, - ov::element::nf4, - ov::element::f4e2m1}; - - CPU_REGISTER_PASS_X64(decompression_handling_manager, - ov::pass::MarkDequantization, - decompression_precisions, - false, - true); - CPU_SET_CALLBACK_X64( + CPU_REGISTER_PASS_ARM(decompression_handling_manager, ov::pass::TransposeMatMul); + const auto& decompression_precisions = ov::intel_cpu::node::FullyConnected::getSupportedCompressedWeightsTypes(); + CPU_REGISTER_PASS_COMMON(decompression_handling_manager, + ov::pass::MarkDequantization, + decompression_precisions, + false, + true); + CPU_SET_CALLBACK_COMMON( decompression_handling_manager, [&](const_node_ptr& node) -> bool { return !is_decompression_multiply(node); diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/matmul_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/matmul_weights_decompression.cpp new file mode 100644 index 
00000000000000..408dd40b4c658f --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/matmul_weights_decompression.cpp @@ -0,0 +1,86 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp" + +#include "openvino/util/env_util.hpp" + +using namespace CPUTestUtils; + +namespace ov { +namespace test { + +namespace { + +std::vector filter_additional_config_basic() { + return {{}, {ov::hint::inference_precision(ov::element::f16)}}; +} + +const std::vector decompression_precisions = {ov::element::f32}; +const std::vector weights_precisions = {ov::element::u8, ov::element::i8}; + +bool should_use_decompression_impl() { +#ifdef CPU_DEBUG_CAPS + return ov::util::getenv_bool("OV_CPU_ENABLE_DNNL_MAMTUL_FOR_FC"); +#else + return false; +#endif +} + +const std::vector input_shapes = { + {{{-1, -1, -1}, {{1, 4, 16}, {10, 16, 16}}}, {16, 32}}, + {{{}, {{1, 8, 16}}}, {16, 32}, 4ul}, + {{{}, {{1, 4, 16}}}, {1, 16, 32}}, + {{{}, {{5, 40, 96}}}, {1, 96, 240}}, + {{{}, {{1, 4, 48}}}, {48, 256}}, + {{{}, {{1, 11, 104}}}, {104, 77}, 104ul}, + {{{-1, -1, -1}, {{10, 40, 110}, {11, 40, 110}}}, {1, 110, 256}}, +}; +const std::vector fusing_params{emptyFusingSpec, fusingBias}; + +INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights, + MatmulWeightsDecompression, + ::testing::Combine(::testing::ValuesIn(input_shapes), + ::testing::ValuesIn(weights_precisions), + ::testing::ValuesIn(decompression_precisions), + ::testing::Values(ov::element::undefined), + ::testing::Values(true), + ::testing::Values(DecompressionSubtractType::full), + ::testing::Values(false), + ::testing::ValuesIn(filter_additional_config_basic()), + ::testing::ValuesIn(fusing_params), + ::testing::Values(should_use_decompression_impl())), + MatmulWeightsDecompression::getTestCaseName); + +const std::vector input_shapes_corner_cases = { + {{{-1, -1, -1}, {{1, 4, 16}}}, {1, 16, 32}}, + {{{-1, -1, -1}, {{1, 4, 16}}}, {16, 32}}, + {{{-1, -1, -1}, {{1, 5, 16}}}, {16, 32}, 4ul}, + {{{-1, -1, -1}, {{1, 1, 128}}}, {128, 128}, 16ul}, +}; + +const std::vector transpose_weights = {true, false}; +const std::vector decompression_subtract_type = {DecompressionSubtractType::full, + DecompressionSubtractType::scalar, + DecompressionSubtractType::empty}; +const std::vector reshape_on_decompression = {true, false}; +const std::vector decompression_precisions_corner_cases = {ov::element::f16, ov::element::f32}; + +INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_corner_cases, + MatmulWeightsDecompression, + ::testing::Combine(::testing::ValuesIn(input_shapes_corner_cases), + ::testing::ValuesIn(weights_precisions), + ::testing::ValuesIn(decompression_precisions_corner_cases), + ::testing::Values(ov::element::undefined), + ::testing::ValuesIn(transpose_weights), + ::testing::ValuesIn(decompression_subtract_type), + ::testing::ValuesIn(reshape_on_decompression), + ::testing::ValuesIn(filter_additional_config_basic()), + ::testing::Values(emptyFusingSpec), + ::testing::Values(should_use_decompression_impl())), + MatmulWeightsDecompression::getTestCaseName); + +} // namespace +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp new file mode 100644 index 00000000000000..e14245f2906e16 --- /dev/null +++ 
b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp @@ -0,0 +1,167 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "matmul_weights_decompression.hpp" +#include "openvino/runtime/intel_cpu/properties.hpp" + +using namespace CPUTestUtils; + +namespace ov { +namespace test { + +std::string MatmulWeightsDecompression::getTestCaseName(testing::TestParamInfo obj) { + MatMulDecompressionShapeParams shape_params; + ov::test::ElementType weights_precision; + ov::test::ElementType decompression_precision; + ov::test::ElementType scale_precision; + bool transpose; + DecompressionSubtractType decompression_subtract_type; + bool reshape_on_decompression; + ov::AnyMap additional_config; + fusingSpecificParams fusing_params; + bool should_fuse; + + std::tie(shape_params, + weights_precision, + decompression_precision, + scale_precision, + transpose, + decompression_subtract_type, + reshape_on_decompression, + additional_config, + fusing_params, + should_fuse) = obj.param; + + std::ostringstream result; + result << shape_params << "_"; + result << "weights_precision=" << weights_precision << "_"; + result << "decompression_precision=" << decompression_precision << "_"; + result << "scale_precision=" << scale_precision << "_"; + result << "transpose_weights=" << transpose << "_"; + result << "decompression_subtract=" << decompression_subtract_type << "_"; + result << "reshape_on_decompression=" << reshape_on_decompression << "_"; + + result << "config=("; + for (const auto& configEntry : additional_config) { + result << configEntry.first << ", " << configEntry.second.as() << "_"; + } + result << ")"; + result << CpuTestWithFusing::getTestCaseName(fusing_params); + + return result.str(); +} + +std::shared_ptr MatmulWeightsDecompression::initSubgraph(const ov::PartialShape& data_shape, + const ov::Shape& weights_shape, + const int group_size, + const ov::element::Type data_precision, + const ov::element::Type weights_precision, + const ov::element::Type decompression_precision, + const ov::element::Type scale_precision, + const bool transpose_weights, + const DecompressionSubtractType decompression_subtract_type, + const bool reshape_on_decompression) { + ov::ParameterVector params{std::make_shared(data_precision, data_shape)}; + const auto weights_subgraph = initMatMulDecompressionSubgraph(weights_shape, + group_size, + data_precision, + weights_precision, + decompression_precision, + scale_precision, + transpose_weights, + decompression_subtract_type, + reshape_on_decompression); + auto matMul = std::make_shared(params[0], weights_subgraph); + return makeNgraphFunction(data_precision, params, matMul, "MatmulWeightsDecompression"); +} + +void MatmulWeightsDecompression::SetUp() { + targetDevice = ov::test::utils::DEVICE_CPU; + + MatMulDecompressionShapeParams shape_params; + ov::test::ElementType weights_precision; + ov::test::ElementType decompression_precision; + ov::test::ElementType scale_precision; + bool transpose_weights; + DecompressionSubtractType decompression_subtract_type; + bool reshape_on_decompression; + ov::AnyMap additional_config; + fusingSpecificParams fusing_params; + bool should_fuse; + + std::tie(shape_params, + weights_precision, + decompression_precision, + scale_precision, + transpose_weights, + decompression_subtract_type, + reshape_on_decompression, + additional_config, + fusing_params, + should_fuse) = GetParam(); + + configuration.insert(additional_config.begin(), 
additional_config.end()); + std::tie(postOpMgrPtr, fusedOps) = fusing_params; + init_input_shapes({shape_params.data_shape}); + + if (!configuration.count(ov::hint::dynamic_quantization_group_size.name())) { + abs_threshold = 5e-3; + } + + // if dynamic quantization is enabled + if (configuration.count(ov::hint::dynamic_quantization_group_size.name()) && + configuration.at(ov::hint::dynamic_quantization_group_size.name()) != 0) { + abs_threshold = 0.1; + } + + if (configuration.count(ov::hint::inference_precision.name()) && + configuration.at(ov::hint::inference_precision.name()) == ov::element::f16) { + abs_threshold = 0.2; + } + + ElementType netType = ov::element::f32; + inType = outType = netType; + + function = initSubgraph(inputDynamicShapes[0], + shape_params.weights_shape, + shape_params.decompression_group_size, + netType, + weights_precision, + decompression_precision, + scale_precision, + transpose_weights, + decompression_subtract_type, + reshape_on_decompression); +} + +void MatmulWeightsDecompression::check_results() { + const auto& test_param = GetParam(); + const ov::element::Type compressed_weights_precision = std::get<1>(test_param); + const bool use_matmul_decompression_impl = std::get<9>(test_param); + + const auto runtime_model = compiledModel.get_runtime_model(); + const auto result = runtime_model->get_result(); + auto fc = result->get_input_node_shared_ptr(0); + // Handle precision conversion before output + auto type = fc->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as(); + if (type == "Reorder" || type == "Convert" || type == "Subgraph") + fc = fc->get_input_node_shared_ptr(0); + + type = fc->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as(); + EXPECT_EQ(type, "FullyConnected"); + + const auto& expected_weights_precision = use_matmul_decompression_impl + ? 
compressed_weights_precision + : fc->get_input_element_type(0); + EXPECT_EQ(fc->get_input_element_type(1), expected_weights_precision); +} + +TEST_P(MatmulWeightsDecompression, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + run(); + check_results(); +} + +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp new file mode 100644 index 00000000000000..266aab8e445928 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp @@ -0,0 +1,79 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/ov_tensor_utils.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "shared_test_classes/subgraph/weights_decompression_builders.hpp" +#include "utils/cpu_test_utils.hpp" +#include "utils/fusing_test_utils.hpp" + +using namespace CPUTestUtils; + +namespace ov { +namespace test { + +/* + * WP - weights precision + * DP - decompression precision + * IP - input precision + * SP - scale precision + * Opt - optional + * Subtract_const(WP) + * / + * Weights(WP) Convert(DP) + * | / Multiply_const(SP) + * Convert(DP) Reshape (Opt) / + * \ / Convert(if SP != DP) + * Subtract(Opt) / + * \ Reshape (Opt) + * \ / + * Multiply + * | + * Reshape (in case of group decompression) + * | + * Convert (if IP != DP) + * | + * Data(IP) Transpose(Opt) + * \ / + * Matmul + * | + * Bias + */ +typedef std::tuple // should use decompression implementation + MatmulWeightsDecompressionParams; + +class MatmulWeightsDecompression : public testing::WithParamInterface, + virtual public SubgraphBaseTest, + public CpuTestWithFusing { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + std::shared_ptr initSubgraph(const ov::PartialShape& data_shape, + const ov::Shape& weights_shape, + const int group_size, + const ov::element::Type data_precision, + const ov::element::Type weights_precision, + const ov::element::Type decompression_precision, + const ov::element::Type scale_precision, + const bool transpose_weights, + const DecompressionSubtractType decompression_subtract_type, + const bool reshape_on_decompression); + + void SetUp() override; + + void check_results(); +}; + +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp index 9a434943893eed..5a5a375566b955 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp @@ -1,200 +1,13 @@ -// Copyright (C) 2023-2024 Intel Corporation +// Copyright (C) 2018-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // -#include "shared_test_classes/base/ov_subgraph.hpp" -#include "utils/fusing_test_utils.hpp" -#include "openvino/runtime/intel_cpu/properties.hpp" -#include "shared_test_classes/subgraph/weights_decompression_builders.hpp" +#include "custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp" using namespace CPUTestUtils; namespace ov { namespace test { -/* - * WP - weights precision - * DP - 
decompression precision - * IP - input precision - * SP - scale precision - * Opt - optional - * Subtract_const(WP) - * / - * Weights(WP) Convert(DP) - * | / Multiply_const(SP) - * Convert(DP) Reshape (Opt) / - * \ / Convert(if SP != DP) - * Subtract(Opt) / - * \ Reshape (Opt) - * \ / - * Multiply - * | - * Reshape (in case of group decompression) - * | - * Convert (if IP != DP) - * | - * Data(IP) Transpose(Opt) - * \ / - * Matmul - * | - * Bias - */ -using MatmulWeightsDecompressionParams = std::tuple; // should use decompression implementation - -class MatmulWeightsDecompression : public testing::WithParamInterface, - virtual public SubgraphBaseTest, - public CpuTestWithFusing { -public: - static std::string getTestCaseName(testing::TestParamInfo obj) { - MatMulDecompressionShapeParams shape_params; - ov::test::ElementType weights_precision; - ov::test::ElementType decompression_precision; - ov::test::ElementType scale_precision; - bool transpose; - DecompressionSubtractType decompression_subtract_type; - bool reshape_on_decompression; - ov::AnyMap additional_config; - fusingSpecificParams fusing_params; - bool should_fuse; - - std::tie(shape_params, - weights_precision, - decompression_precision, - scale_precision, - transpose, - decompression_subtract_type, - reshape_on_decompression, - additional_config, - fusing_params, - should_fuse) = obj.param; - - std::ostringstream result; - result << shape_params << "_"; - result << "weights_precision=" << weights_precision << "_"; - result << "decompression_precision=" << decompression_precision << "_"; - result << "scale_precision=" << scale_precision << "_"; - result << "transpose_weights=" << transpose << "_"; - result << "decompression_subtract=" << decompression_subtract_type << "_"; - result << "reshape_on_decompression=" << reshape_on_decompression << "_"; - - result << "config=("; - for (const auto& configEntry : additional_config) { - result << configEntry.first << ", " << configEntry.second.as() << "_"; - } - result << ")"; - result << CpuTestWithFusing::getTestCaseName(fusing_params); - - return result.str(); - } - -protected: - std::shared_ptr initSubgraph(const ov::PartialShape& data_shape, - const ov::Shape& weights_shape, - const int group_size, - const ov::element::Type data_precision, - const ov::element::Type weights_precision, - const ov::element::Type decompression_precision, - const ov::element::Type scale_precision, - const bool transpose_weights, - const DecompressionSubtractType decompression_subtract_type, - const bool reshape_on_decompression) { - ov::ParameterVector params{std::make_shared(data_precision, data_shape)}; - const auto weights_subgraph = initMatMulDecompressionSubgraph(weights_shape, - group_size, - data_precision, - weights_precision, - decompression_precision, - scale_precision, - transpose_weights, - decompression_subtract_type, - reshape_on_decompression); - auto matMul = std::make_shared(params[0], weights_subgraph); - return makeNgraphFunction(data_precision, params, matMul, "MatmulWeightsDecompression"); - } - - void SetUp() override { - targetDevice = ov::test::utils::DEVICE_CPU; - - MatMulDecompressionShapeParams shape_params; - ov::test::ElementType weights_precision; - ov::test::ElementType decompression_precision; - ov::test::ElementType scale_precision; - bool transpose_weights; - DecompressionSubtractType decompression_subtract_type; - bool reshape_on_decompression; - ov::AnyMap additional_config; - fusingSpecificParams fusing_params; - bool should_fuse; - - std::tie(shape_params, - 
weights_precision, - decompression_precision, - scale_precision, - transpose_weights, - decompression_subtract_type, - reshape_on_decompression, - additional_config, - fusing_params, - should_fuse) = GetParam(); - - configuration.insert(additional_config.begin(), additional_config.end()); - std::tie(postOpMgrPtr, fusedOps) = fusing_params; - init_input_shapes({shape_params.data_shape}); - - // if dynamic quantization is enabled - if (configuration.count(ov::hint::dynamic_quantization_group_size.name()) && - configuration.at(ov::hint::dynamic_quantization_group_size.name()) != 0) { - abs_threshold = 0.1; - } else if (!configuration.count(ov::hint::dynamic_quantization_group_size.name())) { - abs_threshold = 5e-3; - } - - ElementType netType = ov::element::f32; - inType = outType = netType; - - function = initSubgraph(inputDynamicShapes[0], - shape_params.weights_shape, - shape_params.decompression_group_size, - netType, - weights_precision, - decompression_precision, - scale_precision, - transpose_weights, - decompression_subtract_type, - reshape_on_decompression); - } - - void check_results() { - const auto& test_param = GetParam(); - const ov::element::Type compressed_weights_precision = std::get<1>(test_param); - const bool use_matmul_decompression_impl = std::get<9>(test_param); - - const auto runtime_model = compiledModel.get_runtime_model(); - const auto result = runtime_model->get_result(); - const auto fc = result->get_input_node_shared_ptr(0); - const auto type = fc->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as(); - EXPECT_EQ(type, "FullyConnected"); - - const auto& expected_weights_precision = use_matmul_decompression_impl - ? compressed_weights_precision - : fc->get_input_element_type(0); - EXPECT_EQ(fc->get_input_element_type(1), expected_weights_precision); - } -}; - -TEST_P(MatmulWeightsDecompression, CompareWithRefs) { - SKIP_IF_CURRENT_TEST_IS_DISABLED() - run(); - check_results(); -} namespace { @@ -205,7 +18,8 @@ std::vector filter_additional_config_basic() { std::vector filter_additional_config_amx() { std::vector additional_config = {}; if (ov::with_cpu_x86_avx512_core_amx()) - additional_config.push_back({{ov::hint::dynamic_quantization_group_size(0), ov::hint::inference_precision(ov::element::bf16)}}); + additional_config.push_back( + {{ov::hint::dynamic_quantization_group_size(0), ov::hint::inference_precision(ov::element::bf16)}}); return additional_config; } @@ -310,8 +124,9 @@ const std::vector input_shapes_corner_cases_amx }; const std::vector transpose_weights = {true, false}; -const std::vector decompression_subtract_type = { - DecompressionSubtractType::full, DecompressionSubtractType::scalar, DecompressionSubtractType::empty}; +const std::vector decompression_subtract_type = {DecompressionSubtractType::full, + DecompressionSubtractType::scalar, + DecompressionSubtractType::empty}; const std::vector reshape_on_decompression = {true, false}; const std::vector decompression_precisions_corner_cases = {ov::element::f16, ov::element::f32}; @@ -387,12 +202,11 @@ const std::vector input_shapes_basic_dyn_quant = {{{}, {{1, 1, 1728}}}, {1728, 128}, 64lu}, }; -const std::vector weights_precisions_dyn_quant = {ov::element::u8, - ov::element::u4}; +const std::vector weights_precisions_dyn_quant = {ov::element::u8, ov::element::u4}; std::vector filter_additional_config_dyn_quant() { std::vector additional_config = { - {{ov::hint::dynamic_quantization_group_size(0)}}, // dynamic quantization is disabled + {{ov::hint::dynamic_quantization_group_size(0)}}, // dynamic 
quantization is disabled {{ov::hint::dynamic_quantization_group_size(16)}}, {{ov::hint::dynamic_quantization_group_size(128)}}, }; diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 4c21c06c491179..7af707df602bfc 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -481,7 +481,6 @@ std::vector disabledTestPatterns() { // Issue 88764, 91647, 108802: accuracy issue retVector.emplace_back(R"(MultipleLSTMCellTest/MultipleLSTMCellTest.CompareWithRefs.*)"); // Compressed weights are not supported - retVector.emplace_back(R"(smoke_MatMulCompressedWeights.*)"); retVector.emplace_back(R"(smoke_MatMulSharedCompressedWeights.*)"); retVector.emplace_back(R"(smoke_MatmulAndGatherSharedWeightsDecompression.*)"); // smoke_Snippets test cases are not supported on arm32 platforms