From cc9882f95291f85e94db33177de79b49b183167a Mon Sep 17 00:00:00 2001
From: Maksim Kutakov
Date: Wed, 15 Mar 2023 17:26:06 +0100
Subject: [PATCH 1/5] First step FC optimized

---
 .../src/memory_desc/dnnl_memory_desc.cpp    |  9 +++-
 .../src/memory_desc/dnnl_memory_desc.h      |  1 +
 .../src/nodes/common/dnnl_executor.cpp      | 21 ----------
 .../src/nodes/common/dnnl_executor.h        | 33 +++++++++++++--
 .../intel_cpu/src/nodes/fullyconnected.cpp  | 41 ++++++++++---------
 .../intel_cpu/src/nodes/fullyconnected.h    |  2 +
 6 files changed, 61 insertions(+), 46 deletions(-)

diff --git a/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.cpp b/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.cpp
index 1f2a17189a31cc..0458f93836779d 100644
--- a/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.cpp
+++ b/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.cpp
@@ -36,13 +36,18 @@ MemoryDescPtr DnnlMemoryDesc::cloneWithNewPrecision(const InferenceEngine::Preci
 }
 
 bool DnnlMemoryDesc::isCompatible(const MemoryDesc &rhs) const {
-    if (MemoryDescType::Dnnl == rhs.getType()) {
-        return this->desc == rhs.as<DnnlMemoryDesc>()->desc;
+    if (MemoryDescType::Dnnl & rhs.getType()) {
+        auto* dnnMemDesc = rhs.as<DnnlMemoryDesc>();
+        return isCompatible(*dnnMemDesc);
     } else {
         return false;
     }
 }
 
+bool DnnlMemoryDesc::isCompatible(const DnnlMemoryDesc& rhs) const {
+    return this->desc == rhs.desc;
+}
+
 std::string DnnlMemoryDesc::serializeFormat() const {
     dnnl::impl::memory_desc_wrapper wrapped(desc.get());
     if (wrapped.is_wino_desc()) {
diff --git a/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.h
index c6a88794485c40..373e66679f8824 100644
--- a/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.h
+++ b/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.h
@@ -26,6 +26,7 @@ class DnnlMemoryDesc : public virtual MemoryDesc {
     MemoryDescPtr cloneWithNewPrecision(const InferenceEngine::Precision prec) const override;
 
     bool isCompatible(const MemoryDesc& rhs) const override;
+    bool isCompatible(const DnnlMemoryDesc& rhs) const;
 
     bool hasLayoutType(LayoutType layoutType) const override { return false; }
 
diff --git a/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.cpp b/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.cpp
index 3f055cc63fe039..ca09526e4a4c8b 100644
--- a/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.cpp
+++ b/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.cpp
@@ -58,27 +58,6 @@ const_dnnl_primitive_desc_t DnnlExecutor::getPrimitiveDesc() const {
     return execPrim.get_primitive_desc();
 }
 
-dnnl::memory::desc DnnlExecutor::getSrcDesc() const {
-    auto pd = getPrimitiveDesc();
-    auto md = DnnlExtensionUtils::query_md(pd, dnnl::query::src_md);
-
-    return md->getDnnlDesc();
-}
-
-dnnl::memory::desc DnnlExecutor::getWeightDesc() const {
-    auto pd = getPrimitiveDesc();
-    auto md = DnnlExtensionUtils::query_md(pd, dnnl::query::weights_md);
-
-    return md->getDnnlDesc();
-}
-
-dnnl::memory::desc DnnlExecutor::getDstDesc() const {
-    auto pd = getPrimitiveDesc();
-    auto md = DnnlExtensionUtils::query_md(pd, dnnl::query::dst_md);
-
-    return md->getDnnlDesc();
-}
-
 impl_desc_type DnnlExecutor::getImplementationType() const {
     auto pd = getPrimitiveDesc();
     return parse_impl_name(DnnlExtensionUtils::query_impl_info_str(pd));
diff --git a/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.h b/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.h
index f824fd8146ecb6..21a6d4f4634bbc 100644
--- a/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.h
+++ b/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.h
@@ -31,17 +31,44 @@ class DnnlExecutor {
         virtual ~DnnlExecutor() = default;
         dnnl::primitive getExecPrim() const;
         const_dnnl_primitive_desc_t getPrimitiveDesc() const;
-        dnnl::memory::desc getSrcDesc() const;
-        dnnl::memory::desc getWeightDesc() const;
-        dnnl::memory::desc getDstDesc() const;
         impl_desc_type getImplementationType() const;
 
+        DnnlMemoryDescPtr getSrcDesc() const {
+            return src_md;
+        }
+        DnnlMemoryDescPtr getWeightDesc() const {
+            return wghts_md;
+        }
+        DnnlMemoryDescPtr getDstDesc() const {
+            return dst_md;
+        }
+        DnnlMemoryDescPtr getScratchPadDesc() const {
+            return scrch_md;
+        }
+
+        const dnnl::memory::desc& getDnnlSrcDesc() const {
+            return src_md->getDnnlDesc();
+        }
+        const dnnl::memory::desc& getDnnlWeightDesc() const {
+            return wghts_md->getDnnlDesc();
+        }
+        const dnnl::memory::desc& getDnnlDstDesc() const {
+            return dst_md->getDnnlDesc();
+        }
+        const dnnl::memory::desc& getDnnlScratchPadDesc() const {
+            return scrch_md->getDnnlDesc();
+        }
+
     protected:
         DnnlExecutor() = default;
         dnnl::primitive execPrim;
         // key is the port number for the primitive that needs memory reordering
        std::unordered_map<int, IntermReorder> inputReorders;
        std::unordered_map<int, IntermReorder> outputReorders;
+        DnnlMemoryDescPtr src_md;
+        DnnlMemoryDescPtr wghts_md;
+        DnnlMemoryDescPtr dst_md;
+        DnnlMemoryDescPtr scrch_md;
 };
 
 }   // namespace intel_cpu
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
index 23b6c5be7cee9f..63aa4a70d2fc9a 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -311,7 +311,7 @@ void FullyConnected::prepareParams() {
                    implementationTypeIP,
                    useConv1x1};
 
-    auto engine = getEngine();
+    auto& engine = getEngine();
 
     auto builder = [&engine](const FCKey& key) -> executorPtr {
         executorPtr execPtr = nullptr;
@@ -404,23 +404,23 @@ void FullyConnected::prepareParams() {
     execPtr = result.first;
 
     if (execPtr) {
-        // no executor yet or shapes changed
-        if (!prevExecPtr || prevExecPtr->getSrcDesc() != execPtr->getSrcDesc()) {
-            auto oldMem = srcMemPtr->GetPrimitive();
-            // fast path: wanted is same with parent node output, typical is static shape with inner product
-            if (execPtr->getSrcDesc() == inDesc->getDnnlDesc()) {
-                primArgs[DNNL_ARG_SRC] = std::move(oldMem);
-            } else {
-                primArgs[DNNL_ARG_SRC] = dnnl::memory(execPtr->getSrcDesc(), oldMem.get_engine(), oldMem.get_data_handle());
-            }
+        if (execPtr->getSrcDesc()->isCompatible(*inDesc)) {
+            primArgs[DNNL_ARG_SRC] = srcMemPtr->GetPrimitive();
+        } else {
+            auto start = std::chrono::steady_clock::now();
+            primArgs[DNNL_ARG_SRC] = dnnl::memory(execPtr->getDnnlSrcDesc(), engine, srcMemPtr->GetData()); //385.681 [ms]
+            auto end = std::chrono::steady_clock::now();
+            g_counters[8] += std::chrono::duration_cast(end - start).count();
         }
-        if (!prevExecPtr || prevExecPtr->getDstDesc() != execPtr->getDstDesc()) {
-            auto oldMem = dstMemPtr->GetPrimitive();
-            if (execPtr->getDstDesc() == outDesc->getDnnlDesc()) {
-                primArgs[DNNL_ARG_DST] = std::move(oldMem);
-            } else {
-                primArgs[DNNL_ARG_DST] = dnnl::memory(execPtr->getDstDesc(), oldMem.get_engine(), oldMem.get_data_handle());
-            }
+
+        if (execPtr->getDstDesc()->isCompatible(*outDesc)) {
+            primArgs[DNNL_ARG_DST] = dstMemPtr->GetPrimitive();
+        } else {
+            primArgs[DNNL_ARG_DST] = dnnl::memory(execPtr->getDnnlDstDesc(), engine, dstMemPtr->GetData());
+        }
+
+        if (!prevExecPtr || !execPtr->getWeightDesc()->isCompatible(*(prevExecPtr->getWeightDesc()))) {
+            primArgs[DNNL_ARG_WEIGHTS] = prepareWeightMemory(execPtr->getWeightDesc())->GetPrimitive();
         }
         if (!prevExecPtr || prevExecPtr->getWeightDesc() != execPtr->getWeightDesc()) {
             primArgs[DNNL_ARG_WEIGHTS] = prepareWeightMemory(DnnlExtensionUtils::makeDescriptor(execPtr->getWeightDesc()))->GetPrimitive();
         }
@@ -438,9 +438,10 @@ void FullyConnected::prepareParams() {
             primArgs[DNNL_ARG_BIAS] = biasMemPtr->GetPrimitive();
         }
 
-        auto pd = execPtr->getPrimitiveDesc();
-        auto scratchpadMem = getScratchPadMem(pd);
-        primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
+        if (!scratchPad || !scratchPad->getDesc().isCompatible(*(execPtr->getScratchPadDesc()))) {
+            scratchPad = context->getScratchPad()->createScratchPadMem(execPtr->getScratchPadDesc());
+        }
+        primArgs[DNNL_ARG_SCRATCHPAD] = scratchPad->GetPrimitive();
 #ifdef CPU_DEBUG_CAPS
         if (result.second == CacheEntryBase::LookUpStatus::Miss) {
             DEBUG_LOG("verbose##", getName(), "##", pd->info(), "\n");
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
index 4de5dff882649d..30fbbcb38b9884 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
@@ -116,6 +116,8 @@ class FullyConnected : public Node {
     float minSparseRate = 1.f;
     float weiSparseRate = 0.f;
     bool useSparseWeightsDecompression();
+
+    MemoryPtr scratchPad;
 };
 
 }   // namespace node

From 26573d694e00351acdde14c8e4057579a7193f57 Mon Sep 17 00:00:00 2001
From: Maksim Kutakov
Date: Wed, 15 Mar 2023 18:17:33 +0100
Subject: [PATCH 2/5] Code fix

---
 src/plugins/intel_cpu/src/nodes/fullyconnected.cpp | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
index 63aa4a70d2fc9a..e338809496c272 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -407,10 +407,7 @@ void FullyConnected::prepareParams() {
         if (execPtr->getSrcDesc()->isCompatible(*inDesc)) {
             primArgs[DNNL_ARG_SRC] = srcMemPtr->GetPrimitive();
         } else {
-            auto start = std::chrono::steady_clock::now();
-            primArgs[DNNL_ARG_SRC] = dnnl::memory(execPtr->getDnnlSrcDesc(), engine, srcMemPtr->GetData()); //385.681 [ms]
-            auto end = std::chrono::steady_clock::now();
-            g_counters[8] += std::chrono::duration_cast(end - start).count();
+            primArgs[DNNL_ARG_SRC] = dnnl::memory(execPtr->getDnnlSrcDesc(), engine, srcMemPtr->GetData());
         }
 
         if (execPtr->getDstDesc()->isCompatible(*outDesc)) {
@@ -422,9 +419,6 @@ void FullyConnected::prepareParams() {
         if (!prevExecPtr || !execPtr->getWeightDesc()->isCompatible(*(prevExecPtr->getWeightDesc()))) {
             primArgs[DNNL_ARG_WEIGHTS] = prepareWeightMemory(execPtr->getWeightDesc())->GetPrimitive();
         }
-        if (!prevExecPtr || prevExecPtr->getWeightDesc() != execPtr->getWeightDesc()) {
-            primArgs[DNNL_ARG_WEIGHTS] = prepareWeightMemory(DnnlExtensionUtils::makeDescriptor(execPtr->getWeightDesc()))->GetPrimitive();
-        }
         // changed shapes may also cause the kernel type changed
         selected_pd->setImplementationType(execPtr->getImplementationType());
         // WA: We update implType to know whether weights decompression was used inside the kernel

From 2c1862b041677245100eb1cdec4781572f0015c8 Mon Sep 17 00:00:00 2001
From: Maksim Kutakov
Date: Wed, 15 Mar 2023 18:59:49 +0100
Subject: [PATCH 3/5] Fix further

---
 src/plugins/intel_cpu/src/nodes/fullyconnected.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
index e338809496c272..33141cb48e3073 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -915,10 +915,18 @@ bool FullyConnected::canBeExecutedInConv1x1() const {
 }
 
 FullyConnected::ExecutorInnerProduct::ExecutorInnerProduct(const dnnl::inner_product_forward::primitive_desc& pd) {
+    src_md = DnnlExtensionUtils::makeDescriptor(pd.src_desc());
+    dst_md = DnnlExtensionUtils::makeDescriptor(pd.dst_desc());
+    wghts_md = DnnlExtensionUtils::makeDescriptor(pd.weights_desc());
+    scrch_md = DnnlExtensionUtils::makeDescriptor(pd.scratchpad_desc());
     execPrim = dnnl::inner_product_forward(pd);
 }
 
 FullyConnected::ExecutorConv1x1::ExecutorConv1x1(const dnnl::convolution_forward::primitive_desc& pd) {
+    src_md = DnnlExtensionUtils::makeDescriptor(pd.src_desc());
+    dst_md = DnnlExtensionUtils::makeDescriptor(pd.dst_desc());
+    wghts_md = DnnlExtensionUtils::makeDescriptor(pd.weights_desc());
+    scrch_md = DnnlExtensionUtils::makeDescriptor(pd.scratchpad_desc());
     execPrim = dnnl::convolution_forward(pd);
 }
 

From 0ee9d3b1863665bcaae01b3239f01537c5aaa24e Mon Sep 17 00:00:00 2001
From: Maksim Kutakov
Date: Fri, 17 Mar 2023 13:28:25 +0100
Subject: [PATCH 4/5] Move toward dnnl executor

---
 src/plugins/intel_cpu/src/node.cpp            |  6 -----
 src/plugins/intel_cpu/src/node.h              | 10 +++----
 .../src/nodes/common/dnnl_executor.cpp        | 18 ++++++++++++-
 .../src/nodes/common/dnnl_executor.h          |  7 +++--
 src/plugins/intel_cpu/src/nodes/concat.h      |  1 +
 src/plugins/intel_cpu/src/nodes/conv.cpp      |  7 ++---
 src/plugins/intel_cpu/src/nodes/deconv.cpp    | 11 +++-----
 .../intel_cpu/src/nodes/fullyconnected.cpp    | 26 +++---------------
 .../intel_cpu/src/nodes/fullyconnected.h      | 12 ---------
 src/plugins/intel_cpu/src/nodes/input.h       |  1 +
 src/plugins/intel_cpu/src/nodes/interaction.h |  1 +
 src/plugins/intel_cpu/src/nodes/lrn.cpp       | 27 ++++++++++++-------
 src/plugins/intel_cpu/src/nodes/lrn.h         |  4 +++
 src/plugins/intel_cpu/src/nodes/matmul.cpp    | 22 +++++++++------
 src/plugins/intel_cpu/src/nodes/matmul.h      |  4 +++
 src/plugins/intel_cpu/src/nodes/pooling.cpp   | 27 ++++++++++++-------
 src/plugins/intel_cpu/src/nodes/pooling.h     |  5 ++++
 src/plugins/intel_cpu/src/nodes/reorder.cpp   |  6 ++++-
 src/plugins/intel_cpu/src/nodes/reorder.h     |  1 +
 src/plugins/intel_cpu/src/nodes/rnn.cpp       | 19 +++++++------
 src/plugins/intel_cpu/src/nodes/rnn.h         |  5 ++++
 src/plugins/intel_cpu/src/nodes/softmax.cpp   | 26 +++++++++++-------
 src/plugins/intel_cpu/src/nodes/softmax.h     |  5 ++++
 src/plugins/intel_cpu/src/nodes/transpose.h   |  1 +
 24 files changed, 142 insertions(+), 110 deletions(-)

diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp
index 03529f39d1c003..64752ea8692fdd 100644
--- a/src/plugins/intel_cpu/src/node.cpp
+++ b/src/plugins/intel_cpu/src/node.cpp
@@ -550,12 +550,6 @@ std::vector Node::getAvailableFormatsForDims(const Shape &di
     return {memory::format_tag::any};
 }
 
-void Node::execute(dnnl::stream strm) {
-    if (prim) {
-        prim.execute(strm, primArgs);
-    }
-}
-
 void Node::updateShapes() {
     IE_ASSERT(isDynamicNode()) << "Node::updateShapes() is called to a static shape node of type: " << getTypeStr() << " with name: " << getName();
     if (needShapeInfer()) {
diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h
index 0d15441972af92..dd78bfd0159b85 100644
--- a/src/plugins/intel_cpu/src/node.h
+++ b/src/plugins/intel_cpu/src/node.h
@@ -334,7 +334,7 @@ class Node {
     void resolveInPlaceEdges();
 
-    virtual void execute(dnnl::stream strm);
+    virtual void execute(dnnl::stream strm) = 0;
     void updateShapes();
     void updateDynamicParams();
     void executeDynamic(dnnl::stream strm);
@@ -578,7 +578,6 @@ class Node {
     std::vector supportedPrimitiveDescriptors;
     std::unordered_map primArgs;
     std::unordered_map postOpsArgs;
-    dnnl::primitive prim;
     std::vector descs;
 
     const GraphContext::CPtr context;
@@ -649,9 +648,10 @@ class Node {
         IE_THROW(NotImplemented) << "[DS] prapareParams not implemented for node with type " << NameFromType(getType());
     }
 
-    MemoryPtr getScratchPadMem(const const_dnnl_primitive_desc_t& pd) {
-        auto scratchpadMemoryDesc = DnnlExtensionUtils::query_md(pd, dnnl::query::scratchpad_md);
-        scratchpadMem = context->getScratchPad()->createScratchPadMem(scratchpadMemoryDesc);
+    MemoryPtr getScratchPadMem(const DnnlMemoryDescPtr& desc) {
+        if (!scratchpadMem || !scratchpadMem->getDesc().isCompatible(*desc)) {
+            scratchpadMem = context->getScratchPad()->createScratchPadMem(desc);
+        }
         return scratchpadMem;
     }
 
diff --git a/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.cpp b/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.cpp
index ca09526e4a4c8b..7d337457494de9 100644
--- a/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.cpp
+++ b/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.cpp
@@ -9,6 +9,14 @@ using namespace dnnl;
 namespace ov {
 namespace intel_cpu {
 
+DnnlExecutor::DnnlExecutor(const dnnl::primitive_desc& pd) {
+    execPrim = dnnl::primitive(pd);
+    src_md = DnnlExtensionUtils::makeDescriptor(pd.src_desc());
+    dst_md = DnnlExtensionUtils::makeDescriptor(pd.dst_desc());
+    wghts_md = DnnlExtensionUtils::makeDescriptor(pd.weights_desc());
+    scrch_md = DnnlExtensionUtils::makeDescriptor(pd.scratchpad_desc());
+}
+
 DnnlExecutor::IntermReorder::IntermReorder(const dnnl::memory::desc& descSrc,
                                            const dnnl::memory::desc& descDst,
                                            const dnnl::engine& engine) : m_descSrc(descSrc), m_descDst(descDst) {
@@ -20,7 +28,15 @@ void DnnlExecutor::IntermReorder::exec(dnnl::memory& memSrc, dnnl::memory& memDs
     m_reorder.execute(strm, memSrc, memDst);
 }
 
-void DnnlExecutor::exec(std::unordered_map<int, dnnl::memory> primArgs, dnnl::stream strm) {
+void DnnlExecutor::exec(const std::unordered_map<int, dnnl::memory>& primArgs, dnnl::stream strm) {
+    if (inputReorders.empty() && outputReorders.empty()) {
+        execPrim.execute(strm, primArgs);
+    } else {
+        reorder_exec(primArgs, strm);
+    }
+}
+
+void DnnlExecutor::reorder_exec(std::unordered_map<int, dnnl::memory> primArgs, dnnl::stream strm) {
     for (auto &inReorder : inputReorders) {
         if (primArgs.count(inReorder.first)) {
             dnnl::memory memDst(inReorder.second.getDstDesc(), strm.get_engine());
diff --git a/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.h b/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.h
index 21a6d4f4634bbc..0f3eff13797eef 100644
--- a/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.h
+++ b/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.h
@@ -26,7 +26,8 @@ class DnnlExecutor {
         };
 
     public:
-        void exec(std::unordered_map<int, dnnl::memory> primArgs, dnnl::stream strm);
+        explicit DnnlExecutor(const dnnl::primitive_desc& pd);
+        void exec(const std::unordered_map<int, dnnl::memory>& primArgs, dnnl::stream strm);
         bool needReordering() const;
         virtual ~DnnlExecutor() = default;
         dnnl::primitive getExecPrim() const;
         const_dnnl_primitive_desc_t getPrimitiveDesc() const;
         impl_desc_type getImplementationType() const;
@@ -60,7 +61,9 @@ class DnnlExecutor {
         }
 
     protected:
-        DnnlExecutor() = default;
+        void reorder_exec(std::unordered_map<int, dnnl::memory> primArgs, dnnl::stream strm);
+
+    protected:
         dnnl::primitive execPrim;
         // key is the port number for the primitive that needs memory reordering
         std::unordered_map<int, IntermReorder> inputReorders;
diff --git a/src/plugins/intel_cpu/src/nodes/concat.h b/src/plugins/intel_cpu/src/nodes/concat.h
index 9a0a8a66274321..32831bcede332a 100644
--- a/src/plugins/intel_cpu/src/nodes/concat.h
+++ b/src/plugins/intel_cpu/src/nodes/concat.h
@@ -52,6 +52,7 @@ class Concat : public Node {
     InferenceEngine::Precision outputPrecision = InferenceEngine::Precision::FP32;
     bool canExecRef = false;
     static constexpr size_t MAX_RANK_REF = 6;
+    dnnl::primitive prim;
 };
 
 }   // namespace node
diff --git a/src/plugins/intel_cpu/src/nodes/conv.cpp b/src/plugins/intel_cpu/src/nodes/conv.cpp
index 3ce92de2169f6c..002af7c98b24f4 100644
--- a/src/plugins/intel_cpu/src/nodes/conv.cpp
+++ b/src/plugins/intel_cpu/src/nodes/conv.cpp
@@ -1495,8 +1495,7 @@ void Convolution::prepareParams() {
 
         Node::appendPostOpArgs(*pAttrLocal, primArgs, convPostOpsArgs[preferLegacyPostOps]);
 
-        auto pd = execPtr->getPrimitiveDesc();
-        auto scratchpadMem = getScratchPadMem(pd);
+        auto scratchpadMem = getScratchPadMem(execPtr->getScratchPadDesc());
         primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
 
 #ifdef CPU_DEBUG_CAPS
@@ -1513,9 +1512,7 @@ Convolution::ConvolutionExecutor::ConvolutionExecutor(const dnnl::convolution_fo
                                                       const dnnl::memory::desc& inMemDesc,
                                                       const dnnl::memory::desc& weightMemDesc,
                                                       const dnnl::memory::desc& outMemDesc,
-                                                      const dnnl::engine& engine) {
-    execPrim = dnnl::convolution_forward(pd);
-
+                                                      const dnnl::engine& engine) : DnnlExecutor(pd) {
     if (inMemDesc != pd.src_desc()) {
         inputReorders.insert({DNNL_ARG_SRC, IntermReorder(inMemDesc, pd.src_desc(), engine)});
     }
diff --git a/src/plugins/intel_cpu/src/nodes/deconv.cpp b/src/plugins/intel_cpu/src/nodes/deconv.cpp
index db013ced146e6d..a8dbae4d00a471 100644
--- a/src/plugins/intel_cpu/src/nodes/deconv.cpp
+++ b/src/plugins/intel_cpu/src/nodes/deconv.cpp
@@ -991,8 +991,7 @@ void Deconvolution::prepareParams() {
     }
     Node::appendPostOpArgs(*pAttrLocal, primArgs, postOpsArgs);
 
-    auto pd = execPtr->getPrimitiveDesc();
-    auto scratchpadMem = getScratchPadMem(pd);
+    auto scratchpadMem = getScratchPadMem(execPtr->getScratchPadDesc());
     primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
 #ifdef CPU_DEBUG_CAPS
     if (result.second == CacheEntryBase::LookUpStatus::Miss) {
@@ -1094,9 +1093,7 @@ Deconvolution::DeconvExecutorDefault::DeconvExecutorDefault(const dnnl::convolut
                                                             const dnnl::memory::desc& inMemDesc,
                                                             const dnnl::memory::desc& weightMemDesc,
                                                             const dnnl::memory::desc& outMemDesc,
-                                                            const dnnl::engine& engine) {
-    execPrim = dnnl::convolution_backward_data(pd);
-
+                                                            const dnnl::engine& engine) : DnnlExecutor(pd) {
     if (inMemDesc != pd.diff_dst_desc()) {
         inputReorders.insert({DNNL_ARG_DIFF_DST, IntermReorder(inMemDesc, pd.diff_dst_desc(), engine)});
     }
@@ -1114,9 +1111,7 @@ Deconvolution::DeconvExecutorInt8::DeconvExecutorInt8(const dnnl::deconvolution_
                                                       const dnnl::memory::desc& inMemDesc,
                                                       const dnnl::memory::desc& weightMemDesc,
                                                       const dnnl::memory::desc& outMemDesc,
-                                                      const dnnl::engine& engine) {
-    execPrim = dnnl::deconvolution_forward(pd);
-
+                                                      const dnnl::engine& engine) : DnnlExecutor(pd) {
     if (inMemDesc != pd.src_desc()) {
         inputReorders.insert({DNNL_ARG_SRC, IntermReorder(inMemDesc, pd.src_desc(), engine)});
     }
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
index 33141cb48e3073..18e2353d40395a 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -333,7 +333,7 @@ void FullyConnected::prepareParams() {
             }
 
             if (prim_desc) {
-                execPtr = std::make_shared<ExecutorConv1x1>(prim_desc);
+                execPtr = std::make_shared<DnnlExecutor>(prim_desc);
             }
         }
         // fallback
@@ -388,7 +388,7 @@ void FullyConnected::prepareParams() {
             }
         }
 
-        execPtr = std::make_shared<ExecutorInnerProduct>(prim_desc);
+        execPtr = std::make_shared<DnnlExecutor>(prim_desc);
     }
     return execPtr;
 };
@@ -432,10 +432,8 @@ void FullyConnected::prepareParams() {
             primArgs[DNNL_ARG_BIAS] = biasMemPtr->GetPrimitive();
         }
 
-        if (!scratchPad || !scratchPad->getDesc().isCompatible(*(execPtr->getScratchPadDesc()))) {
-            scratchPad = context->getScratchPad()->createScratchPadMem(execPtr->getScratchPadDesc());
-        }
-        primArgs[DNNL_ARG_SCRATCHPAD] = scratchPad->GetPrimitive();
+        auto schratchpadMem = getScratchPadMem(execPtr->getScratchPadDesc());
+        primArgs[DNNL_ARG_SCRATCHPAD] = schratchpadMem->GetPrimitive();
 #ifdef CPU_DEBUG_CAPS
         if (result.second == CacheEntryBase::LookUpStatus::Miss) {
             DEBUG_LOG("verbose##", getName(), "##", pd->info(), "\n");
@@ -914,22 +912,6 @@ bool FullyConnected::canBeExecutedInConv1x1() const {
     return retVal;
 }
 
-FullyConnected::ExecutorInnerProduct::ExecutorInnerProduct(const dnnl::inner_product_forward::primitive_desc& pd) {
-    src_md = DnnlExtensionUtils::makeDescriptor(pd.src_desc());
-    dst_md = DnnlExtensionUtils::makeDescriptor(pd.dst_desc());
-    wghts_md = DnnlExtensionUtils::makeDescriptor(pd.weights_desc());
-    scrch_md = DnnlExtensionUtils::makeDescriptor(pd.scratchpad_desc());
-    execPrim = dnnl::inner_product_forward(pd);
-}
-
-FullyConnected::ExecutorConv1x1::ExecutorConv1x1(const dnnl::convolution_forward::primitive_desc& pd) {
-    src_md = DnnlExtensionUtils::makeDescriptor(pd.src_desc());
-    dst_md = DnnlExtensionUtils::makeDescriptor(pd.dst_desc());
-    wghts_md = DnnlExtensionUtils::makeDescriptor(pd.weights_desc());
-    scrch_md = DnnlExtensionUtils::makeDescriptor(pd.scratchpad_desc());
-    execPrim = dnnl::convolution_forward(pd);
-}
-
 MemoryPtr FullyConnected::prepareWeightMemory(DnnlMemoryDescPtr weightDesc) {
     if (!getParentEdgeAt(1)->getParent()->isConstant())
         IE_THROW() << "Weight input is not const for node " << getName() << ".";
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
index 30fbbcb38b9884..3f0983f2fc2a77 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
@@ -90,16 +90,6 @@ class FullyConnected : public Node {
     std::unordered_map privateWeightCache;
     dnnl::primitive_attr attr;
 
-    class ExecutorInnerProduct : public DnnlExecutor {
-    public:
-        ExecutorInnerProduct(const dnnl::inner_product_forward::primitive_desc& pd);
-    };
-
-    class ExecutorConv1x1 : public DnnlExecutor {
-    public:
-        ExecutorConv1x1(const dnnl::convolution_forward::primitive_desc& pd);
-    };
-
     static dnnl::convolution_forward::primitive_desc
     createDescriptorInternalForConv(DnnlMemoryDescCPtr inputDescPtr,
                                     DnnlMemoryDescCPtr weightDescPtr,
@@ -116,8 +106,6 @@ class FullyConnected : public Node {
     float minSparseRate = 1.f;
     float weiSparseRate = 0.f;
     bool useSparseWeightsDecompression();
-
-    MemoryPtr scratchPad;
 };
 
 }   // namespace node
diff --git a/src/plugins/intel_cpu/src/nodes/input.h b/src/plugins/intel_cpu/src/nodes/input.h
index d3c05b721da6f0..71ae6b91e7660c 100644
--- a/src/plugins/intel_cpu/src/nodes/input.h
+++ b/src/plugins/intel_cpu/src/nodes/input.h
@@ -31,6 +31,7 @@ class Input : public Node {
     void withMeanImage();
     MemoryCPtr getMemoryPtr() const;
 
+    void execute(dnnl::stream strm) override {}
     void executeDynamicImpl(dnnl::stream strm) override {}
     bool isExecutable() const override {
         return false;
diff --git a/src/plugins/intel_cpu/src/nodes/interaction.h b/src/plugins/intel_cpu/src/nodes/interaction.h
index 661cfc22de8b88..122ae3b2addc8c 100644
--- a/src/plugins/intel_cpu/src/nodes/interaction.h
+++ b/src/plugins/intel_cpu/src/nodes/interaction.h
@@ -60,6 +60,7 @@ class Interaction : public Node {
 
 private:
     void execRef(dnnl::stream strm);
+    dnnl::primitive prim;
     size_t batchSize = 0;
     size_t featureSize = 0;
     size_t inputSizes = 0;
diff --git a/src/plugins/intel_cpu/src/nodes/lrn.cpp b/src/plugins/intel_cpu/src/nodes/lrn.cpp
index 5cc0dce6230eae..f5f8995626d3e4 100644
--- a/src/plugins/intel_cpu/src/nodes/lrn.cpp
+++ b/src/plugins/intel_cpu/src/nodes/lrn.cpp
@@ -182,7 +182,7 @@ void Lrn::prepareParams() {
     LrnKey key = {inpDesc, selected_pd->getImplementationType(), alg, size, k, alpha, beta, attr};
     auto engine = getEngine();
 
-    auto builder = [&engine](const LrnKey& key) -> dnnl::primitive {
+    auto builder = [&engine](const LrnKey& key) -> executorPtr {
         auto desc = std::make_shared(
             engine,
             dnnl::prop_kind::forward_inference,
@@ -205,25 +205,24 @@ void Lrn::prepareParams() {
                 break;
             }
             if (!itpd.next_impl())
-                return dnnl::lrn_forward();
+                return nullptr;
         }
-        return dnnl::lrn_forward(prim_desc);
+        return std::make_shared<DnnlExecutor>(prim_desc);
     };
 
     auto cache = context->getParamsCache();
     auto result = cache->getOrCreate(key, builder);
 
-    if (!result.first) {
+    execPtr = result.first;
+    if (!execPtr) {
         IE_THROW() << "Primitive descriptor was not found for node " << getName() << ".";
     }
 
-    prim = result.first;
-
-    auto pd = prim.get_primitive_desc();
-    auto scratchpadMem = getScratchPadMem(pd);
+    auto scratchpadMem = getScratchPadMem(execPtr->getScratchPadDesc());
 
-    auto src = srcMemPtr->GetPrimitive();
-    auto dst = dstMemPtr->GetPrimitive();
-    primArgs = { {DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}, {DNNL_ARG_SCRATCHPAD, scratchpadMem->GetPrimitive()} };
+    primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
+    primArgs[DNNL_ARG_SRC] = srcMemPtr->GetPrimitive();
+    primArgs[DNNL_ARG_DST] = dstMemPtr->GetPrimitive();
 }
 
 bool Lrn::created() const {
@@ -250,6 +249,14 @@ void Lrn::createDescriptor(const std::vector &inputDesc,
     descs.push_back(desc);
 }
 
+void Lrn::execute(dnnl::stream strm) {
+    if (execPtr) {
+        execPtr->exec(primArgs, strm);
+    } else {
+        IE_THROW() << errorPrefix << " doesn't have an initialized executor";
+    }
+}
+
 void Lrn::executeDynamicImpl(dnnl::stream strm) {
     execute(strm);
 }
diff --git a/src/plugins/intel_cpu/src/nodes/lrn.h b/src/plugins/intel_cpu/src/nodes/lrn.h
index b821fa8b70e521..c1635261f70faf 100644
--- a/src/plugins/intel_cpu/src/nodes/lrn.h
+++ b/src/plugins/intel_cpu/src/nodes/lrn.h
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include "common/dnnl_executor.h"
 
 namespace ov {
 namespace intel_cpu {
@@ -31,11 +32,14 @@ class Lrn : public Node {
     }
 
     void prepareParams() override;
+    void execute(dnnl::stream strm) override;
     void executeDynamicImpl(dnnl::stream strm) override;
 
     static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept;
 
 private:
+    using executorPtr = std::shared_ptr<DnnlExecutor>;
+    executorPtr execPtr = nullptr;
     dnnl::algorithm alg;
     size_t size = 1;
     int k = 1;
diff --git a/src/plugins/intel_cpu/src/nodes/matmul.cpp b/src/plugins/intel_cpu/src/nodes/matmul.cpp
index 83e4bd1a179294..5bfe42500adb15 100644
--- a/src/plugins/intel_cpu/src/nodes/matmul.cpp
+++ b/src/plugins/intel_cpu/src/nodes/matmul.cpp
@@ -613,7 +613,7 @@ void MatMul::prepareParams() {
 
     auto engine = getEngine();
 
-    auto builder = [&engine](const MatMulKey& key) -> dnnl::primitive {
+    auto builder = [&engine](const MatMulKey& key) -> executorPtr {
         dnnl::matmul::primitive_desc matmul_desc;
 
         if (key.bias) {
@@ -653,22 +653,20 @@ void MatMul::prepareParams() {
                 break;
             }
         }
-        return matmul(prim_desc);
+        return std::make_shared<DnnlExecutor>(prim_desc);
     };
 
     auto cache = context->getParamsCache();
     auto result = cache->getOrCreate(key, builder);
 
-    if (!result.first) {
+    execPtr = result.first;
+    if (!execPtr) {
         IE_THROW() << "Primitive descriptor was not found for node " << getName() << ".";
     }
 
-    prim = result.first;
+    auto schratchpadMem = getScratchPadMem(execPtr->getScratchPadDesc());
 
-    auto pd = prim.get_primitive_desc();
-    auto scratchpadMem = getScratchPadMem(pd);
-
-    primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
+    primArgs[DNNL_ARG_SCRATCHPAD] = schratchpadMem->GetPrimitive();
     primArgs[DNNL_ARG_SRC_0] = src0MemPtr->GetPrimitive();
     primArgs[DNNL_ARG_WEIGHTS_0] = src1MemPtr->GetPrimitive();
     primArgs[DNNL_ARG_DST] = dstMemPtr->GetPrimitive();
@@ -678,6 +676,14 @@ void MatMul::prepareParams() {
     appendPostOpArgs(*attr, primArgs, postOpsArgs);
 }
 
+void MatMul::execute(dnnl::stream strm) {
+    if (execPtr) {
+        execPtr->exec(primArgs, strm);
+    } else {
+        IE_THROW() << errorPrefix << " doesn't have an initialized executor";
+    }
+}
+
 void MatMul::executeDynamicImpl(dnnl::stream strm) {
     execute(strm);
 }
diff --git a/src/plugins/intel_cpu/src/nodes/matmul.h b/src/plugins/intel_cpu/src/nodes/matmul.h
index 5c8902483972b8..16d2140cbe5eee 100644
--- a/src/plugins/intel_cpu/src/nodes/matmul.h
+++ b/src/plugins/intel_cpu/src/nodes/matmul.h
@@ -10,6 +10,7 @@
 #include
 #include
 #include "memory_desc/dnnl_blocked_memory_desc.h"
+#include "common/dnnl_executor.h"
 
 namespace ov {
 namespace intel_cpu {
@@ -38,6 +39,7 @@ class MatMul : public Node {
     }
 
     void prepareParams() override;
+    void execute(dnnl::stream strm) override;
    void executeDynamicImpl(dnnl::stream strm) override;
 
     static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept;
@@ -48,6 +50,8 @@ class MatMul : public Node {
     AttrPtr initPrimitiveAttr(const VectorDims& dims);
 
 private:
+    using executorPtr = std::shared_ptr<DnnlExecutor>;
+    executorPtr execPtr = nullptr;
     dnnl::memory::desc getBiasDescFrom(const DnnlMemoryDescCPtr outMemDesc);
     std::pair makeDummyInputShapes(const Shape& in0, const Shape& in1) const;
 
diff --git a/src/plugins/intel_cpu/src/nodes/pooling.cpp b/src/plugins/intel_cpu/src/nodes/pooling.cpp
index fc56f8d812ce54..b31c358911904a 100644
--- a/src/plugins/intel_cpu/src/nodes/pooling.cpp
+++ b/src/plugins/intel_cpu/src/nodes/pooling.cpp
@@ -369,7 +369,7 @@ void Pooling::prepareParams() {
                       alg,
                       selected_pd->getImplementationType()};
     auto engine = getEngine();
 
-    auto builder = [&engine](const PoolingKey& key) -> dnnl::primitive {
+    auto builder = [&engine](const PoolingKey& key) -> executorPtr {
         primitive_desc_iterator itpd = createDescriptorHelper(engine,
                                                               key.inp->getDnnlDesc(),
                                                               key.out->getDnnlDesc(),
@@ -393,27 +393,34 @@ void Pooling::prepareParams() {
             break;
         }
 
-        return pooling_forward(prim_desc);
+        return std::make_shared<DnnlExecutor>(prim_desc);
     };
 
     auto cache = context->getParamsCache();
     auto result = cache->getOrCreate(key, builder);
 
-    if (!result.first) {
+    execPtr = result.first;
+
+    if (!execPtr) {
         IE_THROW() << "Primitive descriptor was not found for node " << getName() << ".";
     }
 
-    prim = result.first;
-
-    auto pd = prim.get_primitive_desc();
-    auto scratchpadMem = getScratchPadMem(pd);
-    auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
-    auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
-    primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}, {DNNL_ARG_SCRATCHPAD, scratchpadMem->GetPrimitive()}};
+    auto scratchpadMem = getScratchPadMem(execPtr->getScratchPadDesc());
+    primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
+    primArgs[DNNL_ARG_SRC] = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
+    primArgs[DNNL_ARG_DST] = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
 
     Node::appendPostOpArgs(*attr, primArgs, postOpsArgs);
 }
 
+void Pooling::execute(dnnl::stream strm) {
+    if (execPtr) {
+        execPtr->exec(primArgs, strm);
+    } else {
+        IE_THROW() << "Pooling node with name '" << getName() << "' doesn't have an initialized executor";
+    }
+}
+
 void Pooling::executeDynamicImpl(dnnl::stream strm) {
     execute(strm);
 }
diff --git a/src/plugins/intel_cpu/src/nodes/pooling.h b/src/plugins/intel_cpu/src/nodes/pooling.h
index 2daaa3f9a528e8..6d76e3d48980a2 100644
--- a/src/plugins/intel_cpu/src/nodes/pooling.h
+++ b/src/plugins/intel_cpu/src/nodes/pooling.h
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include "common/dnnl_executor.h"
 
 namespace ov {
 namespace intel_cpu {
@@ -30,6 +31,7 @@ class Pooling : public Node {
     }
 
     void prepareParams() override;
+    void execute(dnnl::stream strm) override;
     void executeDynamicImpl(dnnl::stream strm) override;
 
     static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept;
@@ -38,6 +40,9 @@ class Pooling : public Node {
     AttrPtr initPrimitiveAttr() override;
 
 private:
+    using executorPtr = std::shared_ptr<DnnlExecutor>;
+    executorPtr execPtr = nullptr;
+
     void setPostOps(dnnl::primitive_attr &attr);
 
     void initEffectiveAttributes(const Shape &inDims, const Shape &outDims);
diff --git a/src/plugins/intel_cpu/src/nodes/reorder.cpp b/src/plugins/intel_cpu/src/nodes/reorder.cpp
index 2efcd0e44b6e69..5dd5674abd9814 100644
--- a/src/plugins/intel_cpu/src/nodes/reorder.cpp
+++ b/src/plugins/intel_cpu/src/nodes/reorder.cpp
@@ -336,7 +336,11 @@ void Reorder::execute(dnnl::stream strm) {
         src_blocked->setDataHandle(getParentEdgeAt(0)->getMemory().GetData());
         dst_blocked->setDataHandle(getChildEdgeAt(0)->getMemory().GetData());
 
-        Node::execute(strm);
+        if (prim) {
+            prim.execute(strm, primArgs);
+        } else {
+            IE_THROW() << "Reorder node with name " << getName() << " doesn't have an initialized primitive";
+        }
     }
 }
 
diff --git a/src/plugins/intel_cpu/src/nodes/reorder.h b/src/plugins/intel_cpu/src/nodes/reorder.h
index f6091a6c91bd43..4bd3fa8fc3211b 100644
--- a/src/plugins/intel_cpu/src/nodes/reorder.h
+++ b/src/plugins/intel_cpu/src/nodes/reorder.h
@@ -66,6 +66,7 @@ class Reorder : public Node {
     static void reorderData(const Memory &input, const Memory &output, MultiCachePtr cache = nullptr);
 
 private:
+    dnnl::reorder::primitive prim;
     std::shared_ptr input;
     std::shared_ptr output;
 
diff --git a/src/plugins/intel_cpu/src/nodes/rnn.cpp b/src/plugins/intel_cpu/src/nodes/rnn.cpp
index e7b97b9355d214..4ed7ed7a4e5550 100644
--- a/src/plugins/intel_cpu/src/nodes/rnn.cpp
+++ b/src/plugins/intel_cpu/src/nodes/rnn.cpp
@@ -1062,7 +1062,7 @@ void RNN::prepareParams() {
     RNNKey key = { inDataDescs, outDataDescs, wDescs, cell_type, cell_act, direction, *attr };
 
     auto engine = getEngine();
 
-    auto builder = [&engine](const RNNKey& key) -> dnnl::primitive {
+    auto builder = [&engine](const RNNKey& key) -> executorPtr {
         const auto descPtr = createPrimitiveDescriptor(engine,
                                                        key.cellType,
                                                        key.cellAct,
@@ -1072,23 +1072,22 @@ void RNN::prepareParams() {
                                                        key.wDescs,
                                                        key.attr);
 
-        return dnnl::primitive(descPtr);
+        return std::make_shared<DnnlExecutor>(descPtr);
     };
 
     auto cache = context->getParamsCache();
     auto result = cache->getOrCreate(key, builder);
 
-    if (!result.first) {
+    execPtr = result.first;
+
+    if (!execPtr) {
         IE_THROW() << "Primitive descriptor was not found for node " << getName() << ".";
     }
 
-    prim = result.first;
-
-    auto pd = prim.get_primitive_desc();
-    scratchpadMem = getScratchPadMem(pd);
+    scratchpadMem = getScratchPadMem(execPtr->getScratchPadDesc());
 
     if (!wasMemoryPrepared || wFormatWasChanged) {
-        auto pd = prim.get_primitive_desc();
+        auto pd = execPtr->getPrimitiveDesc();
         auto query_weights_md = [&](int idx = 0) -> dnnl::memory::desc {
             auto what = dnnl::convert_to_c(dnnl::query::weights_md);
             const_dnnl_memory_desc_t cdesc = dnnl_primitive_desc_query_md(pd, what, idx);
@@ -1118,7 +1117,7 @@ std::shared_ptr RNN::getDstMemDesc(dnnl::primitive_desc_iterator& pr
 }
 
 void RNN::execute(dnnl::stream strm) {
-    if (!prim)
+    if (!execPtr)
         THROW_ERROR << "does not have initialized primitive to execute.";
 
     const auto src_data_mem = getParentEdgeAt(0)->getMemoryPtr();
@@ -1160,7 +1159,7 @@ void RNN::execute(dnnl::stream strm) {
         }
     }
 
-    prim.execute(strm, args);
+    execPtr->exec(args, strm);
 }
 
 void RNN::executeDynamicImpl(dnnl::stream strm) {
diff --git a/src/plugins/intel_cpu/src/nodes/rnn.h b/src/plugins/intel_cpu/src/nodes/rnn.h
index b94d026adcf75c..dbe4f9769d14b7 100644
--- a/src/plugins/intel_cpu/src/nodes/rnn.h
+++ b/src/plugins/intel_cpu/src/nodes/rnn.h
@@ -11,6 +11,8 @@
 #include
 #include
 
+#include "common/dnnl_executor.h"
+
 namespace ov {
 namespace intel_cpu {
 namespace node {
@@ -66,6 +68,9 @@ class RNN : public Node {
 
     void copyWeightsData();
 
+    using executorPtr = std::shared_ptr<DnnlExecutor>;
+    executorPtr execPtr = nullptr;
+
     /** Specify mode Cell or Seq. true - Cell, false - Seq */
     bool is_cell = false;
diff --git a/src/plugins/intel_cpu/src/nodes/softmax.cpp b/src/plugins/intel_cpu/src/nodes/softmax.cpp
index 7f3d3c337e5792..65176e4a7c7907 100644
--- a/src/plugins/intel_cpu/src/nodes/softmax.cpp
+++ b/src/plugins/intel_cpu/src/nodes/softmax.cpp
@@ -170,7 +170,7 @@ void SoftMax::prepareParams() {
     SoftmaxKey key = {inpDesc, selected_pd->getImplementationType(), axis, *attr};
     auto engine = getEngine();
 
-    auto builder = [&engine](const SoftmaxKey& key) -> dnnl::primitive {
+    auto builder = [&engine](const SoftmaxKey& key) -> executorPtr {
         softmax_forward::primitive_desc prim_desc;
         auto desc = std::make_shared(
             engine,
@@ -196,26 +196,32 @@ void SoftMax::prepareParams() {
                 break;
             }
             if (!itpd.next_impl())
-                return softmax_forward();
+                return nullptr;
         }
-        return softmax_forward(prim_desc);
+        return std::make_shared<DnnlExecutor>(prim_desc);
     };
 
     auto cache = context->getParamsCache();
     auto result = cache->getOrCreate(key, builder);
 
-    if (!result.first) {
+    execPtr = result.first;
+    if (!execPtr) {
         IE_THROW() << "Primitive descriptor was not found for node " << getName() << ".";
     }
 
-    prim = result.first;
+    auto scratchpadMem = getScratchPadMem(execPtr->getScratchPadDesc());
 
-    auto pd = prim.get_primitive_desc();
-    auto scratchpadMem = getScratchPadMem(pd);
+    primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
+    primArgs[DNNL_ARG_SRC] = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
+    primArgs[DNNL_ARG_DST] = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
+}
 
-    auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
-    auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
-    primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}, {DNNL_ARG_SCRATCHPAD, scratchpadMem->GetPrimitive()}};
+void SoftMax::execute(dnnl::stream strm) {
+    if (execPtr) {
+        execPtr->exec(primArgs, strm);
+    } else {
+        IE_THROW() << "Softmax node with name '" << getName() << "' doesn't have an initialized executor";
+    }
 }
 
 void SoftMax::executeDynamicImpl(dnnl::stream strm) {
diff --git a/src/plugins/intel_cpu/src/nodes/softmax.h b/src/plugins/intel_cpu/src/nodes/softmax.h
index 78fc51115a18d7..1a472075168406 100644
--- a/src/plugins/intel_cpu/src/nodes/softmax.h
+++ b/src/plugins/intel_cpu/src/nodes/softmax.h
@@ -11,6 +11,8 @@
 #include
 #include
 
+#include "common/dnnl_executor.h"
+
 namespace ov {
 namespace intel_cpu {
 namespace node {
@@ -26,11 +28,14 @@ class SoftMax : public Node {
     bool created() const override;
     AttrPtr initPrimitiveAttr() override;
     void prepareParams() override;
+    void execute(dnnl::stream strm) override;
     void executeDynamicImpl(dnnl::stream strm) override;
 
     static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept;
 
 private:
+    using executorPtr = std::shared_ptr<DnnlExecutor>;
+    executorPtr execPtr = nullptr;
     size_t axis = 0;
 };
 
diff --git a/src/plugins/intel_cpu/src/nodes/transpose.h b/src/plugins/intel_cpu/src/nodes/transpose.h
index b13bc1a0a745ab..03988d24fe8367 100644
--- a/src/plugins/intel_cpu/src/nodes/transpose.h
+++ b/src/plugins/intel_cpu/src/nodes/transpose.h
@@ -48,6 +48,7 @@ class Transpose : public Node {
     };
     using executorPtr = std::shared_ptr<TransposeExecutor>;
     executorPtr execPtr = nullptr;
+    dnnl::primitive prim;
 
     struct TransposeJitExecutor : public TransposeExecutor {
         TransposeJitExecutor(const PermuteParams& params);

From 765f5e1a0044ebb52b056202df6571db1aa51fcb Mon Sep 17 00:00:00 2001
From: Maksim Kutakov
Date: Tue, 21 Mar 2023 18:41:08 +0100
Subject: [PATCH 5/5] Use stored md for interim reorders

---
 src/plugins/intel_cpu/src/nodes/conv.cpp   | 12 ++++++------
 src/plugins/intel_cpu/src/nodes/deconv.cpp | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/plugins/intel_cpu/src/nodes/conv.cpp b/src/plugins/intel_cpu/src/nodes/conv.cpp
index 002af7c98b24f4..25655838aa960c 100644
--- a/src/plugins/intel_cpu/src/nodes/conv.cpp
+++ b/src/plugins/intel_cpu/src/nodes/conv.cpp
@@ -1513,16 +1513,16 @@ Convolution::ConvolutionExecutor::ConvolutionExecutor(const dnnl::convolution_fo
                                                       const dnnl::memory::desc& weightMemDesc,
                                                       const dnnl::memory::desc& outMemDesc,
                                                       const dnnl::engine& engine) : DnnlExecutor(pd) {
-    if (inMemDesc != pd.src_desc()) {
-        inputReorders.insert({DNNL_ARG_SRC, IntermReorder(inMemDesc, pd.src_desc(), engine)});
+    if (inMemDesc != getDnnlSrcDesc()) {
+        inputReorders.insert({DNNL_ARG_SRC, IntermReorder(inMemDesc, getDnnlSrcDesc(), engine)});
     }
 
-    if (weightMemDesc != pd.weights_desc()) {
-        inputReorders.insert({DNNL_ARG_WEIGHTS, IntermReorder(weightMemDesc, pd.weights_desc(), engine)});
+    if (weightMemDesc != getDnnlWeightDesc()) {
+        inputReorders.insert({DNNL_ARG_WEIGHTS, IntermReorder(weightMemDesc, getDnnlWeightDesc(), engine)});
     }
 
-    if (outMemDesc != pd.dst_desc()) {
-        outputReorders.insert({DNNL_ARG_DST, IntermReorder(pd.dst_desc(), outMemDesc, engine)});
+    if (outMemDesc != getDnnlDstDesc()) {
+        outputReorders.insert({DNNL_ARG_DST, IntermReorder(getDnnlDstDesc(), outMemDesc, engine)});
     }
 }
 
diff --git a/src/plugins/intel_cpu/src/nodes/deconv.cpp b/src/plugins/intel_cpu/src/nodes/deconv.cpp
index a8dbae4d00a471..2395a4a6af2a8d 100644
--- a/src/plugins/intel_cpu/src/nodes/deconv.cpp
+++ b/src/plugins/intel_cpu/src/nodes/deconv.cpp
@@ -1112,16 +1112,16 @@ Deconvolution::DeconvExecutorInt8::DeconvExecutorInt8(const dnnl::deconvolution_
                                                       const dnnl::memory::desc& weightMemDesc,
                                                       const dnnl::memory::desc& outMemDesc,
                                                       const dnnl::engine& engine) : DnnlExecutor(pd) {
-    if (inMemDesc != pd.src_desc()) {
-        inputReorders.insert({DNNL_ARG_SRC, IntermReorder(inMemDesc, pd.src_desc(), engine)});
+    if (inMemDesc != getDnnlSrcDesc()) {
+        inputReorders.insert({DNNL_ARG_SRC, IntermReorder(inMemDesc, getDnnlSrcDesc(), engine)});
     }
 
-    if (weightMemDesc != pd.weights_desc()) {
-        inputReorders.insert({DNNL_ARG_WEIGHTS, IntermReorder(weightMemDesc, pd.weights_desc(), engine)});
+    if (weightMemDesc != getDnnlWeightDesc()) {
+        inputReorders.insert({DNNL_ARG_WEIGHTS, IntermReorder(weightMemDesc, getDnnlWeightDesc(), engine)});
     }
 
-    if (outMemDesc != pd.dst_desc()) {
-        outputReorders.insert({DNNL_ARG_DST, IntermReorder(pd.dst_desc(), outMemDesc, engine)});
+    if (outMemDesc != getDnnlDstDesc()) {
+        outputReorders.insert({DNNL_ARG_DST, IntermReorder(getDnnlDstDesc(), outMemDesc, engine)});
     }
 }