From e9400c5f1f451a686a9365027a024a7501670890 Mon Sep 17 00:00:00 2001 From: Gorokhov Dmitriy Date: Thu, 19 Dec 2024 19:20:04 +0400 Subject: [PATCH] [CPU] Enable compressed FC via oneDNN Matmul primitive (#27459) ### Details: - This PR enables execution of FullyConnected operations via the oneDNN MatMul primitive - Matmul_weights_decompression tests are split into x64 and ARM instances; the ARM tests pass using the reference matmul. - The newly added functionality is still gated behind debug caps. To try it out: -- Build OV with the -DENABLE_DEBUG_CAPS=ON cmake option -- export OV_CPU_ENABLE_DNNL_MAMTUL_FOR_FC=1 --- src/plugins/intel_cpu/src/cpu_memory.cpp | 2 +- .../intel_cpu/src/dnnl_postops_composer.cpp | 68 +++++- .../intel_cpu/src/dnnl_postops_composer.h | 15 +- .../src/nodes/common/cpu_convert.cpp | 13 +- .../dnnl/dnnl_fullyconnected_primitive.cpp | 16 +- .../executors/dnnl/dnnl_matmul_primitive.cpp | 90 +++++--- .../executors/dnnl/dnnl_matmul_primitive.hpp | 2 + .../fullyconnected_implementations.cpp | 5 +- .../intel_cpu/src/nodes/fullyconnected.cpp | 40 +++- .../intel_cpu/src/nodes/fullyconnected.h | 2 + .../convert_to_cpu_specific_opset.hpp | 25 +-- .../aarch64/pass/snippets_mark_skipped.cpp | 16 ++ .../transformation_pipeline.cpp | 21 +- .../src/arm/matmul_weights_decompression.cpp | 86 ++++++++ .../classes/matmul_weights_decompression.cpp | 167 ++++++++++++++ .../classes/matmul_weights_decompression.hpp | 79 +++++++ .../src/x64/matmul_weights_decompression.cpp | 204 +----------------- .../skip_tests_config.cpp | 1 - 18 files changed, 574 insertions(+), 278 deletions(-) create mode 100644 src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/matmul_weights_decompression.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp diff --git a/src/plugins/intel_cpu/src/cpu_memory.cpp b/src/plugins/intel_cpu/src/cpu_memory.cpp index 7cb4abc2161f14..71851c529c6095 100644 --- a/src/plugins/intel_cpu/src/cpu_memory.cpp +++ b/src/plugins/intel_cpu/src/cpu_memory.cpp @@ -45,7 +45,7 @@ void transferData(const IMemory& src, const IMemory& dst, bool ftz) { if (!ftz) { return; } - if (src.getDesc().getPrecision() != ov::element::f32 || dst.getDesc().getPrecision() == ov::element::bf16) { + if (src.getDesc().getPrecision() != ov::element::f32 || dst.getDesc().getPrecision() != ov::element::f32) { return; } size_t offset = 0; diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp b/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp index 9b86a1433acb06..be0c8a2a62d954 100644 --- a/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp +++ b/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp @@ -628,13 +628,71 @@ static MemoryPtr prepackDecompressionParams(const MemoryCPtr& paramsPtr, auto srcMem = std::make_shared(engine, srcMemoryDesc, paramsPtr->getData()); dstMem->load(*srcMem); - return dstMem; } +static dnnl::memory::dims getGroupDims(const VectorDims& weiDims, const VectorDims& scaleDims) { + if (scaleDims[0] == 1 && scaleDims[1] == 1) + return {}; + + int N = weiDims[weiDims.size() - 2]; + int K = weiDims[weiDims.size() - 1]; + dnnl::memory::dim groupN = N / scaleDims[0]; + dnnl::memory::dim groupK = K / scaleDims[1]; + + return {groupK, groupN}; +} + +static int getMask(const VectorDims& weiDims, const dnnl::memory::dims& groupDims) { + const int maskN = 1 << (weiDims.size() - 1); + const int maskK = 1 
<< (weiDims.size() - 2); + int N = weiDims[weiDims.size() - 2]; + int K = weiDims[weiDims.size() - 1]; + int mask = 0; + if (!groupDims.empty() && groupDims[1] != N) + mask += maskN; + if (!groupDims.empty() && groupDims[0] != K) + mask += maskK; + + return mask; +} + void DnnlPostOpsComposer::appendDecompressionScales(const MemoryCPtr& scales_ptr, bool needTranspose, - ov::element::Type dstPrecision) { + ov::element::Type dstPrecision, + const VectorDims& weiDims) { + if (scales_ptr == nullptr) + return; + + auto scaleMem = prepackDecompressionParams(scales_ptr, needTranspose, dstPrecision, engine); + auto groupDims = getGroupDims(weiDims, scaleMem->getStaticDims()); + auto mask = getMask(weiDims, groupDims); + + attr.set_scales(DNNL_ARG_WEIGHTS, mask, groupDims, DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); + cpuArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = std::move(scaleMem); + dnnlArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = + cpuArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS]->getPrimitive(); +} + +void DnnlPostOpsComposer::appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, + bool needTranspose, + ov::element::Type dstPrecision, + const VectorDims& weiDims) { + if (zero_points_ptr == nullptr) + return; + + auto zeroPointsMem = prepackDecompressionParams(zero_points_ptr, needTranspose, dstPrecision, engine); + auto groupDims = getGroupDims(weiDims, zeroPointsMem->getStaticDims()); + auto mask = getMask(weiDims, groupDims); + + attr.set_zero_points(DNNL_ARG_WEIGHTS, mask, groupDims, DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); + cpuArgs[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS] = zeroPointsMem; + dnnlArgs[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS] = zeroPointsMem->getPrimitive(); +} + +void DnnlPostOpsComposer::appendDecompressionScalesLegacy(const MemoryCPtr& scales_ptr, + bool needTranspose, + ov::element::Type dstPrecision) { if (scales_ptr == nullptr) return; @@ -647,9 +705,9 @@ void DnnlPostOpsComposer::appendDecompressionScales(const MemoryCPtr& scales_ptr cpuArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS]->getPrimitive(); } -void DnnlPostOpsComposer::appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, - bool needTranspose, - ov::element::Type dstPrecision) { +void DnnlPostOpsComposer::appendDecompressionZeroPointsLegacy(const MemoryCPtr& zero_points_ptr, + bool needTranspose, + ov::element::Type dstPrecision) { if (zero_points_ptr == nullptr) return; diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer.h b/src/plugins/intel_cpu/src/dnnl_postops_composer.h index 7ae634658b005f..81fd1aaeed194d 100644 --- a/src/plugins/intel_cpu/src/dnnl_postops_composer.h +++ b/src/plugins/intel_cpu/src/dnnl_postops_composer.h @@ -30,10 +30,21 @@ class DnnlPostOpsComposer { const MemoryArgs& memory, const dnnl::memory::data_type outDataType); DnnlPrimitiveAttrs compose(); - void appendDecompressionScales(const MemoryCPtr& scales_ptr, bool needTranspose, ov::element::Type dstPrecision); + + void appendDecompressionScales(const MemoryCPtr& scales_ptr, + bool needTranspose, + ov::element::Type dstPrecision, + const VectorDims& weiDims); void appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, bool needTranspose, - ov::element::Type dstPrecision); + ov::element::Type dstPrecision, + const VectorDims& weiDims); + void appendDecompressionScalesLegacy(const MemoryCPtr& scales_ptr, + bool needTranspose, + ov::element::Type dstPrecision); + void appendDecompressionZeroPointsLegacy(const MemoryCPtr& zero_points_ptr, + bool 
needTranspose, + ov::element::Type dstPrecision); void setDynamicQuantizationParams(uint64_t groupSize); private: diff --git a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp index a0590827006eb4..0c8cddd905dc2e 100644 --- a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp @@ -570,12 +570,13 @@ struct ConvertFromBinPrecision> { } }; -#define INTEL_CPU_CVT_FROM_4BIT_LIST \ - INTEL_CPU_CVT(u4, f32), INTEL_CPU_CVT(u4, bf16), INTEL_CPU_CVT(u4, f16), INTEL_CPU_CVT(u4, i8), \ - INTEL_CPU_CVT(u4, u8), INTEL_CPU_CVT(i4, f32), INTEL_CPU_CVT(i4, bf16), INTEL_CPU_CVT(i4, f16), \ - INTEL_CPU_CVT(i4, i8), INTEL_CPU_CVT(i4, u8), INTEL_CPU_CVT(nf4, f32), INTEL_CPU_CVT(nf4, bf16), \ - INTEL_CPU_CVT(nf4, f16), INTEL_CPU_CVT(nf4, i8), INTEL_CPU_CVT(nf4, u8), INTEL_CPU_CVT(f4e2m1, f32), \ - INTEL_CPU_CVT(f4e2m1, bf16), INTEL_CPU_CVT(f4e2m1, f16), INTEL_CPU_CVT(f4e2m1, i8), INTEL_CPU_CVT(f4e2m1, u8) +#define INTEL_CPU_CVT_FROM_4BIT_LIST \ + INTEL_CPU_CVT(u4, f32), INTEL_CPU_CVT(u4, i32), INTEL_CPU_CVT(u4, bf16), INTEL_CPU_CVT(u4, f16), \ + INTEL_CPU_CVT(u4, i8), INTEL_CPU_CVT(u4, u8), INTEL_CPU_CVT(i4, f32), INTEL_CPU_CVT(i4, i32), \ + INTEL_CPU_CVT(i4, bf16), INTEL_CPU_CVT(i4, f16), INTEL_CPU_CVT(i4, i8), INTEL_CPU_CVT(i4, u8), \ + INTEL_CPU_CVT(nf4, f32), INTEL_CPU_CVT(nf4, bf16), INTEL_CPU_CVT(nf4, f16), INTEL_CPU_CVT(nf4, i8), \ + INTEL_CPU_CVT(nf4, u8), INTEL_CPU_CVT(f4e2m1, f32), INTEL_CPU_CVT(f4e2m1, bf16), INTEL_CPU_CVT(f4e2m1, f16), \ + INTEL_CPU_CVT(f4e2m1, i8), INTEL_CPU_CVT(f4e2m1, u8) struct ConvertFrom4BitContext { ov::element::Type_t inType; diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp index 52434a1eeb8461..8ae2d2784193af 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp @@ -228,14 +228,16 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const FCAttrs& attrs, if (dstPrc != f8e8m0 || useDynamicQuantization) dstPrc = ov::element::f32; - dnnlpoc.appendDecompressionScales(memory.at(ARG_WEI | ARG_ATTR_SCALES), !attrs.weightsNonTransposed, dstPrc); + dnnlpoc.appendDecompressionScalesLegacy(memory.at(ARG_WEI | ARG_ATTR_SCALES), + !attrs.weightsNonTransposed, + dstPrc); } if (memory.count(ARG_WEI | ARG_ATTR_ZERO_POINTS)) { auto dstPrc = useDynamicQuantization ? ov::element::u8 : ov::element::f32; - dnnlpoc.appendDecompressionZeroPoints(memory.at(ARG_WEI | ARG_ATTR_ZERO_POINTS), - !attrs.weightsNonTransposed, - dstPrc); + dnnlpoc.appendDecompressionZeroPointsLegacy(memory.at(ARG_WEI | ARG_ATTR_ZERO_POINTS), + !attrs.weightsNonTransposed, + dstPrc); } if (useDynamicQuantization) { @@ -247,9 +249,9 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const FCAttrs& attrs, uint8_t zp_value = (wei_precision == ov::element::i8) ? 
128 : 8; DnnlBlockedMemoryDesc zpMemoryDesc(ov::element::u8, Shape({1})); auto decompressionSubtractPtr = std::make_shared(context->getEngine(), zpMemoryDesc, &zp_value); - dnnlpoc.appendDecompressionZeroPoints(decompressionSubtractPtr, - !attrs.weightsNonTransposed, - ov::element::u8); + dnnlpoc.appendDecompressionZeroPointsLegacy(decompressionSubtractPtr, + !attrs.weightsNonTransposed, + ov::element::u8); } dnnlpoc.setDynamicQuantizationParams(attrs.dynamicQuantizationGroupSize); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp index 86b22607111833..9ffe4731689d43 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -76,6 +77,23 @@ bool DnnlMatMulPrimitive::Key::operator==(const Key& rhs) const { return result; } +template +static dimsType normalizeToRank(const dimsType& vec, size_t rank) { + if (vec.size() == rank || vec.empty()) + return vec; + + dimsType result; + result.reserve(rank); + + for (size_t i = vec.size(); i < rank; ++i) { + result.push_back(1); + } + + result.insert(result.end(), vec.begin(), vec.end()); + + return result; +} + std::shared_ptr DnnlMatMulPrimitive::create(const MemoryArgs& memory, const MatMulAttrs& attrs, const ExecutorContext::CPtr context, @@ -105,19 +123,22 @@ DnnlMemoryDescPtr DnnlMatMulPrimitive::makeTransposedWeightDescriptor(const Dnnl const auto& weiDesc = srcDesc->getDnnlDesc(); auto wDims = weiDesc.get_dims(); auto wDataType = weiDesc.get_data_type(); + std::swap(wDims[wDims.size() - 1], wDims[wDims.size() - 2]); dnnl::memory::dims wDims2D = reshapeDownToRank<2>(wDims); const auto format = weightsNonTransposed ? 
dnnl::memory::format_tag::ab : dnnl::memory::format_tag::ba; const auto transposedWeiDesc = dnnl::memory::desc{wDims2D, wDataType, format}; + const auto reshapedWeiDesc = transposedWeiDesc.reshape(dstDesc->getDnnlDesc().get_dims()); - return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc); + return DnnlExtensionUtils::makeDescriptor(reshapedWeiDesc); } static DnnlPrimitiveAttrs createPrimitiveAttrs(const MatMulAttrs& attrs, const PostOps& postOps, const MemoryArgs& memory, ExecutorContext::CPtr context, - bool useDynamicQuantization) { + bool useWeightsDecompression, + bool weightsNonTransposed) { const auto& srcDesc = memory.at(ARG_SRC)->getDescPtr(); const auto& weiDesc = memory.at(ARG_WEI)->getDescPtr(); const auto& dstDesc = memory.at(ARG_DST)->getDescPtr(); @@ -132,7 +153,30 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const MatMulAttrs& attrs, DnnlPostOpsComposer dnnlpoc(postOps, context->getEngine(), dims, dims.size() - 1, isINT8, 1 << 0, memory, outputDataType); - return dnnlpoc.compose(); + const auto maxRank = + std::max({srcDesc->getShape().getRank(), weiDesc->getShape().getRank(), dstDesc->getShape().getRank()}); + const auto normWeiDims = normalizeToRank(weiDesc->getShape().getStaticDims(), maxRank); + if (memory.count(ARG_WEI | ARG_ATTR_SCALES)) { + auto dstPrc = ov::element::f32; + dnnlpoc.appendDecompressionScales(memory.at(ARG_WEI | ARG_ATTR_SCALES), + !weightsNonTransposed, + dstPrc, + normWeiDims); + } + if (memory.count(ARG_WEI | ARG_ATTR_ZERO_POINTS)) { + // TODO: clarify oneDNN requirements on ZP precision + auto zp = memory.at(ARG_WEI | ARG_ATTR_ZERO_POINTS); + auto zpPrc = zp->getPrecision(); + auto dstPrc = one_of(zpPrc, i32, i8, u8, i4, u4) ? zpPrc : i32; + dnnlpoc.appendDecompressionZeroPoints(zp, !weightsNonTransposed, dstPrc, normWeiDims); + } + + auto primAttrs = dnnlpoc.compose(); + if (useWeightsDecompression) { + primAttrs.attr.set_fpmath_mode(fpmath_mode::any, true); + } + + return primAttrs; } static dnnl::matmul::primitive_desc createDescriptorInternal(const dnnl::memory::desc& inputDesc, @@ -143,22 +187,6 @@ static dnnl::matmul::primitive_desc createDescriptorInternal(const dnnl::memory: const dnnl::engine& engine, const bool useSparseWeights, const bool useWeightsDecompression) { - auto normalizeToRank = [](const dnnl::memory::dims& vec, size_t rank) -> dnnl::memory::dims { - if (vec.size() == rank || vec.empty()) - return vec; - - dnnl::memory::dims result; - result.reserve(rank); - - for (size_t i = vec.size(); i < rank; ++i) { - result.push_back(1); - } - - result.insert(result.end(), vec.begin(), vec.end()); - - return result; - }; - auto weiDims = weightDesc.get_dims(); std::swap(weiDims[weiDims.size() - 1], weiDims[weiDims.size() - 2]); @@ -175,7 +203,9 @@ static dnnl::matmul::primitive_desc createDescriptorInternal(const dnnl::memory: auto idt = inputDesc.get_data_type(); auto wdt = idt; - if (idt == dnnl::memory::data_type::u8 || idt == dnnl::memory::data_type::s8) { + if (useWeightsDecompression) { + wdt = weightDesc.get_data_type(); + } else if (idt == dnnl::memory::data_type::u8 || idt == dnnl::memory::data_type::s8) { wdt = memory::data_type::s8; } @@ -245,6 +275,16 @@ static VectorDims makeDummyOutputDims(const VectorDims& inShape, const VectorDim return outputShape; } +bool DnnlMatMulPrimitive::useWeightsDecompressionImpl(const ov::element::Type inputType, + const ov::element::Type weightsType) { +#if defined(OPENVINO_ARCH_X86_64) + if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) + return false; +#endif + + 
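+    /* Descriptive note: weights decompression through the oneDNN MatMul primitive is only used for
+       f32/bf16/f16 activations combined with u8/i8/u4/i4 compressed weights (see the return below);
+       on x86-64 it additionally requires at least AVX2 (checked above), otherwise the regular paths are used. */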
return (one_of(inputType, f32, bf16, f16) && one_of(weightsType, u8, i8, u4, i4)); +} + DnnlShapeAgnosticDataPtr DnnlMatMulPrimitive::createShapeAgnosticData(const FCAttrs& attrs, const PostOps& postOps, const MemoryArgs& memory, @@ -257,7 +297,9 @@ DnnlShapeAgnosticDataPtr DnnlMatMulPrimitive::createShapeAgnosticData(const FCAt auto dstDesc = memory.at(ARG_DST)->getDescPtr(); MatMulAttrs mmAttrs{false, false}; - const auto postOpData = createPrimitiveAttrs(mmAttrs, postOps, memory, context, false); + const auto useWeightsDecompression = useWeightsDecompressionImpl(srcDesc->getPrecision(), weiDesc->getPrecision()); + const auto postOpData = + createPrimitiveAttrs(mmAttrs, postOps, memory, context, useWeightsDecompression, attrs.weightsNonTransposed); if (!cacheWeights) return std::make_shared(postOpData); @@ -285,7 +327,7 @@ DnnlShapeAgnosticDataPtr DnnlMatMulPrimitive::createShapeAgnosticData(const FCAt context->getEngine(), context->getImplPriorities(), false, - false); + useWeightsDecompression); const auto weightsDesc = DnnlExtensionUtils::makeDescriptor(primDesc.weights_desc()); auto originalWeightsDesc = MemoryDescUtils::convertToDnnlMemoryDesc(weiDesc); @@ -319,7 +361,7 @@ DnnlMatMulPrimitive::DnnlMatMulPrimitive(const Key& key, engine, implPriorities, false, - false)), + useWeightsDecompressionImpl(key.src->getPrecision(), key.wei->getPrecision()))), m_implType(implTypeFromPrimDesc(m_primDesc)), m_srcDesc(DnnlExtensionUtils::makeDescriptor(m_primDesc.src_desc())), m_weiDesc(DnnlExtensionUtils::makeDescriptor(m_primDesc.weights_desc())), @@ -328,8 +370,6 @@ DnnlMatMulPrimitive::DnnlMatMulPrimitive(const Key& key, m_prim(primitive(m_primDesc)) {} void DnnlMatMulPrimitive::execute(const dnnl_primitive_args& primArgs) const { - std::cout << "Executing MM primitive" - << "\n"; m_prim.execute(m_stream, primArgs); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.hpp index 618d3abdf8b3de..5491b62a154687 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.hpp @@ -53,6 +53,8 @@ class DnnlMatMulPrimitive { return m_implType; } + static bool useWeightsDecompressionImpl(const ov::element::Type inputType, const ov::element::Type weightsType); + static DnnlShapeAgnosticDataPtr createShapeAgnosticData(const FCAttrs& attrs, const PostOps& postOps, const MemoryArgs& memory, diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp index bc55af8cfbb0e2..f2cf5a7c9102b7 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp @@ -133,6 +133,8 @@ static const TypeMapping dnnlMatMulTypeMapping { // quantization configuration {{_u8 | _i8, _i8, _u8|_i8|_i32|_bf16|_f16|_f32|_undefined, _u8|_i8|_i32|_bf16|_f16|_f32}, pt(bypass(), bypass(), bypass(), bypass())}, {{_u8 | _i8, _i8, _any, _any}, pt(bypass(), bypass(), just(), just())}, + // compresses int weights + {{_f32 | _bf16 | _f16, _u8 | _i8, _any, _any}, pt(bypass(), bypass(), use<0>(), use<0>())}, // @todo should we fallback to FPXX instead of _f32? 
{{_any, _any, _any, _any}, pt(just(), just(), just(), just())}, // @todo explicitly cover configuration limitations for oneDNN on ARM @@ -443,7 +445,7 @@ const std::vector>& getImplementations() { return std::make_shared(attrs, postOps, memory, context); } ) - OV_CPU_INSTANCE_X64( + OV_CPU_INSTANCE_DNNL( "matmul_dnnl", ExecutorType::Dnnl, OperationType::MatMul, @@ -454,7 +456,6 @@ const std::vector>& getImplementations() { CPU_DEBUG_CAP_ENABLE( if (getEnvBool("OV_CPU_ENABLE_DNNL_MAMTUL_FOR_FC")) { VERIFY(noSparseDecompression(config), UNSUPPORTED_SPARSE_WEIGHTS); - VERIFY(noWeightsDecompression(config), UNSUPPORTED_WEIGHTS_DECOMPRESSION); return true; }) return false; diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp index 2df6c0ae7522cc..4a2e3728887087 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp @@ -41,6 +41,42 @@ namespace ov { namespace intel_cpu { namespace node { +ov::element::TypeVector FullyConnected::getSupportedCompressedWeightsTypes() { + using ov::element::Type_t; + + bool useMatmulPrim = false; + CPU_DEBUG_CAP_ENABLE(useMatmulPrim = getEnvBool("OV_CPU_ENABLE_DNNL_MAMTUL_FOR_FC");) + + if (useMatmulPrim) { + return {Type_t::u8, Type_t::i8}; + } else { +#if defined(OPENVINO_ARCH_X86_64) + return {Type_t::u8, Type_t::i8, Type_t::u4, Type_t::i4, Type_t::nf4, Type_t::f4e2m1}; +#else + return {}; +#endif + } +} + +ov::element::TypeVector FullyConnected::getSupportedCompressedActivationsTypes() { + using ov::element::Type_t; + + bool useMatmulPrim = false; + CPU_DEBUG_CAP_ENABLE(useMatmulPrim = getEnvBool("OV_CPU_ENABLE_DNNL_MAMTUL_FOR_FC");) + + if (useMatmulPrim) { + return {Type_t::f32, Type_t::f16}; + } else { +#if defined(OPENVINO_ARCH_X86_64) + // @todo enable for bf16 as well + // after EnforceInferencePrecision is replaced with ConvertPrecision + return {Type_t::f32}; +#else + return {}; +#endif + } +} + bool FullyConnected::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { @@ -113,7 +149,9 @@ bool FullyConnected::isSupportedCompressedOperation(const std::shared_ptr& model, const C CPU_REGISTER_PASS_COMMON(manager, ConvertMatMulToFC); CPU_REGISTER_PASS_COMMON(manager, FullyConnectedBiasFusion); - std::vector supported_activation_types{ - // @todo enable for bf16 as well - // after EnforceInferencePrecision is replaced with ConvertPrecision - ov::element::f32, - }; - - std::vector supported_compressed_weights_types{ - ov::element::u8, - ov::element::i8, - ov::element::u4, - ov::element::i4, - ov::element::nf4, - ov::element::f4e2m1, - }; - - CPU_REGISTER_PASS_X64( + CPU_REGISTER_PASS_COMMON( manager, pass::ConvertFullyConnectedToFullyConnectedCompressed, - supported_activation_types, - supported_compressed_weights_types, + ov::intel_cpu::node::FullyConnected::getSupportedCompressedActivationsTypes(), + ov::intel_cpu::node::FullyConnected::getSupportedCompressedWeightsTypes(), [&config](const std::shared_ptr& fc, size_t IC, size_t OC, size_t G) { return ov::intel_cpu::node::FullyConnected::isSupportedCompressedOperation(fc, IC, @@ -65,8 +50,8 @@ inline void ConvertToCPUSpecificOpset(std::shared_ptr& model, const C }); CPU_REGISTER_PASS_X64(manager, pass::ConvertFCToFCQuantizedLegacy); - CPU_REGISTER_PASS_X64(manager, MoveFCReshapeToWeights); - CPU_REGISTER_PASS_X64(manager, ov::pass::Validate); + CPU_REGISTER_PASS_COMMON(manager, MoveFCReshapeToWeights); + CPU_REGISTER_PASS_COMMON(manager, 
ov::pass::Validate); CPU_REGISTER_PASS_COMMON(manager, AlignMatMulInputRanks); CPU_REGISTER_PASS_COMMON(manager, ConvertTileToSeqTiles); CPU_REGISTER_PASS_COMMON(manager, ConvertToPowerStatic); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp b/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp index 25b10d55ca8165..c567e7c38c2ef1 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp @@ -212,6 +212,13 @@ auto is_skipped_op(const std::shared_ptr& op) -> bool { return ov::is_type(op) || ov::is_type(op) || ov::is_type(op); } + +bool isSuitableMatMulWithConstantPath(const std::shared_ptr& node) { + return ov::is_type(node) && + !ov::is_type(node->get_input_node_shared_ptr(1)) && + ov::op::util::is_on_constant_path(node->input_value(1)); +} + } // namespace bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr& m) { @@ -220,6 +227,15 @@ bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr& m) { for (auto& node : m->get_ordered_ops()) { if (is_skipped_op(node)) continue; + // We perform this check separately because we mark here only weights path + // Matmul itself will be checked further + if (isSuitableMatMulWithConstantPath(node)) { + auto markup_func = [](Node* node) { + SetSnippetsNodeType(node->shared_from_this(), snippets::pass::SnippetsNodeType::SkippedByPlugin); + }; + std::unordered_set visited; + ov::op::util::visit_constant_path(node->get_input_node_ptr(1), visited, markup_func); + } if (isSuitableConvolutionParent(node)) { // Initiate fusing chain SetNodeFusingType(node, NodeFusingType::FusedWithConvolution); diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 469abbd99eb149..13e890f6339e81 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -337,19 +337,14 @@ void Transformations::PreLpt(const std::vector& defaultPrecis CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::MarkShapeOfSubgraphs); // We need to fuse Transpose to MatMul to have a simpler callback for the next transformation CPU_REGISTER_PASS_X64(decompression_handling_manager, ov::pass::TransposeMatMul); - ov::element::TypeVector decompression_precisions{ov::element::u8, - ov::element::i8, - ov::element::u4, - ov::element::i4, - ov::element::nf4, - ov::element::f4e2m1}; - - CPU_REGISTER_PASS_X64(decompression_handling_manager, - ov::pass::MarkDequantization, - decompression_precisions, - false, - true); - CPU_SET_CALLBACK_X64( + CPU_REGISTER_PASS_ARM(decompression_handling_manager, ov::pass::TransposeMatMul); + const auto& decompression_precisions = ov::intel_cpu::node::FullyConnected::getSupportedCompressedWeightsTypes(); + CPU_REGISTER_PASS_COMMON(decompression_handling_manager, + ov::pass::MarkDequantization, + decompression_precisions, + false, + true); + CPU_SET_CALLBACK_COMMON( decompression_handling_manager, [&](const_node_ptr& node) -> bool { return !is_decompression_multiply(node); diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/matmul_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/matmul_weights_decompression.cpp new file mode 100644 index 
00000000000000..408dd40b4c658f --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/matmul_weights_decompression.cpp @@ -0,0 +1,86 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp" + +#include "openvino/util/env_util.hpp" + +using namespace CPUTestUtils; + +namespace ov { +namespace test { + +namespace { + +std::vector filter_additional_config_basic() { + return {{}, {ov::hint::inference_precision(ov::element::f16)}}; +} + +const std::vector decompression_precisions = {ov::element::f32}; +const std::vector weights_precisions = {ov::element::u8, ov::element::i8}; + +bool should_use_decompression_impl() { +#ifdef CPU_DEBUG_CAPS + return ov::util::getenv_bool("OV_CPU_ENABLE_DNNL_MAMTUL_FOR_FC"); +#else + return false; +#endif +} + +const std::vector input_shapes = { + {{{-1, -1, -1}, {{1, 4, 16}, {10, 16, 16}}}, {16, 32}}, + {{{}, {{1, 8, 16}}}, {16, 32}, 4ul}, + {{{}, {{1, 4, 16}}}, {1, 16, 32}}, + {{{}, {{5, 40, 96}}}, {1, 96, 240}}, + {{{}, {{1, 4, 48}}}, {48, 256}}, + {{{}, {{1, 11, 104}}}, {104, 77}, 104ul}, + {{{-1, -1, -1}, {{10, 40, 110}, {11, 40, 110}}}, {1, 110, 256}}, +}; +const std::vector fusing_params{emptyFusingSpec, fusingBias}; + +INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights, + MatmulWeightsDecompression, + ::testing::Combine(::testing::ValuesIn(input_shapes), + ::testing::ValuesIn(weights_precisions), + ::testing::ValuesIn(decompression_precisions), + ::testing::Values(ov::element::undefined), + ::testing::Values(true), + ::testing::Values(DecompressionSubtractType::full), + ::testing::Values(false), + ::testing::ValuesIn(filter_additional_config_basic()), + ::testing::ValuesIn(fusing_params), + ::testing::Values(should_use_decompression_impl())), + MatmulWeightsDecompression::getTestCaseName); + +const std::vector input_shapes_corner_cases = { + {{{-1, -1, -1}, {{1, 4, 16}}}, {1, 16, 32}}, + {{{-1, -1, -1}, {{1, 4, 16}}}, {16, 32}}, + {{{-1, -1, -1}, {{1, 5, 16}}}, {16, 32}, 4ul}, + {{{-1, -1, -1}, {{1, 1, 128}}}, {128, 128}, 16ul}, +}; + +const std::vector transpose_weights = {true, false}; +const std::vector decompression_subtract_type = {DecompressionSubtractType::full, + DecompressionSubtractType::scalar, + DecompressionSubtractType::empty}; +const std::vector reshape_on_decompression = {true, false}; +const std::vector decompression_precisions_corner_cases = {ov::element::f16, ov::element::f32}; + +INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_corner_cases, + MatmulWeightsDecompression, + ::testing::Combine(::testing::ValuesIn(input_shapes_corner_cases), + ::testing::ValuesIn(weights_precisions), + ::testing::ValuesIn(decompression_precisions_corner_cases), + ::testing::Values(ov::element::undefined), + ::testing::ValuesIn(transpose_weights), + ::testing::ValuesIn(decompression_subtract_type), + ::testing::ValuesIn(reshape_on_decompression), + ::testing::ValuesIn(filter_additional_config_basic()), + ::testing::Values(emptyFusingSpec), + ::testing::Values(should_use_decompression_impl())), + MatmulWeightsDecompression::getTestCaseName); + +} // namespace +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp new file mode 100644 index 00000000000000..e14245f2906e16 --- /dev/null +++ 
b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp @@ -0,0 +1,167 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "matmul_weights_decompression.hpp" +#include "openvino/runtime/intel_cpu/properties.hpp" + +using namespace CPUTestUtils; + +namespace ov { +namespace test { + +std::string MatmulWeightsDecompression::getTestCaseName(testing::TestParamInfo obj) { + MatMulDecompressionShapeParams shape_params; + ov::test::ElementType weights_precision; + ov::test::ElementType decompression_precision; + ov::test::ElementType scale_precision; + bool transpose; + DecompressionSubtractType decompression_subtract_type; + bool reshape_on_decompression; + ov::AnyMap additional_config; + fusingSpecificParams fusing_params; + bool should_fuse; + + std::tie(shape_params, + weights_precision, + decompression_precision, + scale_precision, + transpose, + decompression_subtract_type, + reshape_on_decompression, + additional_config, + fusing_params, + should_fuse) = obj.param; + + std::ostringstream result; + result << shape_params << "_"; + result << "weights_precision=" << weights_precision << "_"; + result << "decompression_precision=" << decompression_precision << "_"; + result << "scale_precision=" << scale_precision << "_"; + result << "transpose_weights=" << transpose << "_"; + result << "decompression_subtract=" << decompression_subtract_type << "_"; + result << "reshape_on_decompression=" << reshape_on_decompression << "_"; + + result << "config=("; + for (const auto& configEntry : additional_config) { + result << configEntry.first << ", " << configEntry.second.as() << "_"; + } + result << ")"; + result << CpuTestWithFusing::getTestCaseName(fusing_params); + + return result.str(); +} + +std::shared_ptr MatmulWeightsDecompression::initSubgraph(const ov::PartialShape& data_shape, + const ov::Shape& weights_shape, + const int group_size, + const ov::element::Type data_precision, + const ov::element::Type weights_precision, + const ov::element::Type decompression_precision, + const ov::element::Type scale_precision, + const bool transpose_weights, + const DecompressionSubtractType decompression_subtract_type, + const bool reshape_on_decompression) { + ov::ParameterVector params{std::make_shared(data_precision, data_shape)}; + const auto weights_subgraph = initMatMulDecompressionSubgraph(weights_shape, + group_size, + data_precision, + weights_precision, + decompression_precision, + scale_precision, + transpose_weights, + decompression_subtract_type, + reshape_on_decompression); + auto matMul = std::make_shared(params[0], weights_subgraph); + return makeNgraphFunction(data_precision, params, matMul, "MatmulWeightsDecompression"); +} + +void MatmulWeightsDecompression::SetUp() { + targetDevice = ov::test::utils::DEVICE_CPU; + + MatMulDecompressionShapeParams shape_params; + ov::test::ElementType weights_precision; + ov::test::ElementType decompression_precision; + ov::test::ElementType scale_precision; + bool transpose_weights; + DecompressionSubtractType decompression_subtract_type; + bool reshape_on_decompression; + ov::AnyMap additional_config; + fusingSpecificParams fusing_params; + bool should_fuse; + + std::tie(shape_params, + weights_precision, + decompression_precision, + scale_precision, + transpose_weights, + decompression_subtract_type, + reshape_on_decompression, + additional_config, + fusing_params, + should_fuse) = GetParam(); + + configuration.insert(additional_config.begin(), 
additional_config.end()); + std::tie(postOpMgrPtr, fusedOps) = fusing_params; + init_input_shapes({shape_params.data_shape}); + + if (!configuration.count(ov::hint::dynamic_quantization_group_size.name())) { + abs_threshold = 5e-3; + } + + // if dynamic quantization is enabled + if (configuration.count(ov::hint::dynamic_quantization_group_size.name()) && + configuration.at(ov::hint::dynamic_quantization_group_size.name()) != 0) { + abs_threshold = 0.1; + } + + if (configuration.count(ov::hint::inference_precision.name()) && + configuration.at(ov::hint::inference_precision.name()) == ov::element::f16) { + abs_threshold = 0.2; + } + + ElementType netType = ov::element::f32; + inType = outType = netType; + + function = initSubgraph(inputDynamicShapes[0], + shape_params.weights_shape, + shape_params.decompression_group_size, + netType, + weights_precision, + decompression_precision, + scale_precision, + transpose_weights, + decompression_subtract_type, + reshape_on_decompression); +} + +void MatmulWeightsDecompression::check_results() { + const auto& test_param = GetParam(); + const ov::element::Type compressed_weights_precision = std::get<1>(test_param); + const bool use_matmul_decompression_impl = std::get<9>(test_param); + + const auto runtime_model = compiledModel.get_runtime_model(); + const auto result = runtime_model->get_result(); + auto fc = result->get_input_node_shared_ptr(0); + // Handle precision conversion before output + auto type = fc->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as(); + if (type == "Reorder" || type == "Convert" || type == "Subgraph") + fc = fc->get_input_node_shared_ptr(0); + + type = fc->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as(); + EXPECT_EQ(type, "FullyConnected"); + + const auto& expected_weights_precision = use_matmul_decompression_impl + ? 
compressed_weights_precision + : fc->get_input_element_type(0); + EXPECT_EQ(fc->get_input_element_type(1), expected_weights_precision); +} + +TEST_P(MatmulWeightsDecompression, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + run(); + check_results(); +} + +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp new file mode 100644 index 00000000000000..266aab8e445928 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp @@ -0,0 +1,79 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/ov_tensor_utils.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "shared_test_classes/subgraph/weights_decompression_builders.hpp" +#include "utils/cpu_test_utils.hpp" +#include "utils/fusing_test_utils.hpp" + +using namespace CPUTestUtils; + +namespace ov { +namespace test { + +/* + * WP - weights precision + * DP - decompression precision + * IP - input precision + * SP - scale precision + * Opt - optional + * Subtract_const(WP) + * / + * Weights(WP) Convert(DP) + * | / Multiply_const(SP) + * Convert(DP) Reshape (Opt) / + * \ / Convert(if SP != DP) + * Subtract(Opt) / + * \ Reshape (Opt) + * \ / + * Multiply + * | + * Reshape (in case of group decompression) + * | + * Convert (if IP != DP) + * | + * Data(IP) Transpose(Opt) + * \ / + * Matmul + * | + * Bias + */ +typedef std::tuple // should use decompression implementation + MatmulWeightsDecompressionParams; + +class MatmulWeightsDecompression : public testing::WithParamInterface, + virtual public SubgraphBaseTest, + public CpuTestWithFusing { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + std::shared_ptr initSubgraph(const ov::PartialShape& data_shape, + const ov::Shape& weights_shape, + const int group_size, + const ov::element::Type data_precision, + const ov::element::Type weights_precision, + const ov::element::Type decompression_precision, + const ov::element::Type scale_precision, + const bool transpose_weights, + const DecompressionSubtractType decompression_subtract_type, + const bool reshape_on_decompression); + + void SetUp() override; + + void check_results(); +}; + +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp index 9a434943893eed..5a5a375566b955 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp @@ -1,200 +1,13 @@ -// Copyright (C) 2023-2024 Intel Corporation +// Copyright (C) 2018-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // -#include "shared_test_classes/base/ov_subgraph.hpp" -#include "utils/fusing_test_utils.hpp" -#include "openvino/runtime/intel_cpu/properties.hpp" -#include "shared_test_classes/subgraph/weights_decompression_builders.hpp" +#include "custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp" using namespace CPUTestUtils; namespace ov { namespace test { -/* - * WP - weights precision - * DP - 
decompression precision - * IP - input precision - * SP - scale precision - * Opt - optional - * Subtract_const(WP) - * / - * Weights(WP) Convert(DP) - * | / Multiply_const(SP) - * Convert(DP) Reshape (Opt) / - * \ / Convert(if SP != DP) - * Subtract(Opt) / - * \ Reshape (Opt) - * \ / - * Multiply - * | - * Reshape (in case of group decompression) - * | - * Convert (if IP != DP) - * | - * Data(IP) Transpose(Opt) - * \ / - * Matmul - * | - * Bias - */ -using MatmulWeightsDecompressionParams = std::tuple; // should use decompression implementation - -class MatmulWeightsDecompression : public testing::WithParamInterface, - virtual public SubgraphBaseTest, - public CpuTestWithFusing { -public: - static std::string getTestCaseName(testing::TestParamInfo obj) { - MatMulDecompressionShapeParams shape_params; - ov::test::ElementType weights_precision; - ov::test::ElementType decompression_precision; - ov::test::ElementType scale_precision; - bool transpose; - DecompressionSubtractType decompression_subtract_type; - bool reshape_on_decompression; - ov::AnyMap additional_config; - fusingSpecificParams fusing_params; - bool should_fuse; - - std::tie(shape_params, - weights_precision, - decompression_precision, - scale_precision, - transpose, - decompression_subtract_type, - reshape_on_decompression, - additional_config, - fusing_params, - should_fuse) = obj.param; - - std::ostringstream result; - result << shape_params << "_"; - result << "weights_precision=" << weights_precision << "_"; - result << "decompression_precision=" << decompression_precision << "_"; - result << "scale_precision=" << scale_precision << "_"; - result << "transpose_weights=" << transpose << "_"; - result << "decompression_subtract=" << decompression_subtract_type << "_"; - result << "reshape_on_decompression=" << reshape_on_decompression << "_"; - - result << "config=("; - for (const auto& configEntry : additional_config) { - result << configEntry.first << ", " << configEntry.second.as() << "_"; - } - result << ")"; - result << CpuTestWithFusing::getTestCaseName(fusing_params); - - return result.str(); - } - -protected: - std::shared_ptr initSubgraph(const ov::PartialShape& data_shape, - const ov::Shape& weights_shape, - const int group_size, - const ov::element::Type data_precision, - const ov::element::Type weights_precision, - const ov::element::Type decompression_precision, - const ov::element::Type scale_precision, - const bool transpose_weights, - const DecompressionSubtractType decompression_subtract_type, - const bool reshape_on_decompression) { - ov::ParameterVector params{std::make_shared(data_precision, data_shape)}; - const auto weights_subgraph = initMatMulDecompressionSubgraph(weights_shape, - group_size, - data_precision, - weights_precision, - decompression_precision, - scale_precision, - transpose_weights, - decompression_subtract_type, - reshape_on_decompression); - auto matMul = std::make_shared(params[0], weights_subgraph); - return makeNgraphFunction(data_precision, params, matMul, "MatmulWeightsDecompression"); - } - - void SetUp() override { - targetDevice = ov::test::utils::DEVICE_CPU; - - MatMulDecompressionShapeParams shape_params; - ov::test::ElementType weights_precision; - ov::test::ElementType decompression_precision; - ov::test::ElementType scale_precision; - bool transpose_weights; - DecompressionSubtractType decompression_subtract_type; - bool reshape_on_decompression; - ov::AnyMap additional_config; - fusingSpecificParams fusing_params; - bool should_fuse; - - std::tie(shape_params, - 
weights_precision, - decompression_precision, - scale_precision, - transpose_weights, - decompression_subtract_type, - reshape_on_decompression, - additional_config, - fusing_params, - should_fuse) = GetParam(); - - configuration.insert(additional_config.begin(), additional_config.end()); - std::tie(postOpMgrPtr, fusedOps) = fusing_params; - init_input_shapes({shape_params.data_shape}); - - // if dynamic quantization is enabled - if (configuration.count(ov::hint::dynamic_quantization_group_size.name()) && - configuration.at(ov::hint::dynamic_quantization_group_size.name()) != 0) { - abs_threshold = 0.1; - } else if (!configuration.count(ov::hint::dynamic_quantization_group_size.name())) { - abs_threshold = 5e-3; - } - - ElementType netType = ov::element::f32; - inType = outType = netType; - - function = initSubgraph(inputDynamicShapes[0], - shape_params.weights_shape, - shape_params.decompression_group_size, - netType, - weights_precision, - decompression_precision, - scale_precision, - transpose_weights, - decompression_subtract_type, - reshape_on_decompression); - } - - void check_results() { - const auto& test_param = GetParam(); - const ov::element::Type compressed_weights_precision = std::get<1>(test_param); - const bool use_matmul_decompression_impl = std::get<9>(test_param); - - const auto runtime_model = compiledModel.get_runtime_model(); - const auto result = runtime_model->get_result(); - const auto fc = result->get_input_node_shared_ptr(0); - const auto type = fc->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as(); - EXPECT_EQ(type, "FullyConnected"); - - const auto& expected_weights_precision = use_matmul_decompression_impl - ? compressed_weights_precision - : fc->get_input_element_type(0); - EXPECT_EQ(fc->get_input_element_type(1), expected_weights_precision); - } -}; - -TEST_P(MatmulWeightsDecompression, CompareWithRefs) { - SKIP_IF_CURRENT_TEST_IS_DISABLED() - run(); - check_results(); -} namespace { @@ -205,7 +18,8 @@ std::vector filter_additional_config_basic() { std::vector filter_additional_config_amx() { std::vector additional_config = {}; if (ov::with_cpu_x86_avx512_core_amx()) - additional_config.push_back({{ov::hint::dynamic_quantization_group_size(0), ov::hint::inference_precision(ov::element::bf16)}}); + additional_config.push_back( + {{ov::hint::dynamic_quantization_group_size(0), ov::hint::inference_precision(ov::element::bf16)}}); return additional_config; } @@ -310,8 +124,9 @@ const std::vector input_shapes_corner_cases_amx }; const std::vector transpose_weights = {true, false}; -const std::vector decompression_subtract_type = { - DecompressionSubtractType::full, DecompressionSubtractType::scalar, DecompressionSubtractType::empty}; +const std::vector decompression_subtract_type = {DecompressionSubtractType::full, + DecompressionSubtractType::scalar, + DecompressionSubtractType::empty}; const std::vector reshape_on_decompression = {true, false}; const std::vector decompression_precisions_corner_cases = {ov::element::f16, ov::element::f32}; @@ -387,12 +202,11 @@ const std::vector input_shapes_basic_dyn_quant = {{{}, {{1, 1, 1728}}}, {1728, 128}, 64lu}, }; -const std::vector weights_precisions_dyn_quant = {ov::element::u8, - ov::element::u4}; +const std::vector weights_precisions_dyn_quant = {ov::element::u8, ov::element::u4}; std::vector filter_additional_config_dyn_quant() { std::vector additional_config = { - {{ov::hint::dynamic_quantization_group_size(0)}}, // dynamic quantization is disabled + {{ov::hint::dynamic_quantization_group_size(0)}}, // dynamic 
quantization is disabled {{ov::hint::dynamic_quantization_group_size(16)}}, {{ov::hint::dynamic_quantization_group_size(128)}}, }; diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 4c21c06c491179..7af707df602bfc 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -481,7 +481,6 @@ std::vector disabledTestPatterns() { // Issue 88764, 91647, 108802: accuracy issue retVector.emplace_back(R"(MultipleLSTMCellTest/MultipleLSTMCellTest.CompareWithRefs.*)"); // Compressed weights are not supported - retVector.emplace_back(R"(smoke_MatMulCompressedWeights.*)"); retVector.emplace_back(R"(smoke_MatMulSharedCompressedWeights.*)"); retVector.emplace_back(R"(smoke_MatmulAndGatherSharedWeightsDecompression.*)"); // smoke_Snippets test cases are not supported on arm32 platforms