From d62effb86b50781efa24af18a8af77bef9bd11db Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Fri, 6 Dec 2024 13:24:22 +0800 Subject: [PATCH 01/23] [CPU] Optimize small batch case for PagedAttention (#27847) ### Details: - *Generate more work items to avoid thread imbalance* - *...* ### Tickets: - *[156347](https://jira.devtools.intel.com/browse/CVS-156347)* - *[158477](https://jira.devtools.intel.com/browse/CVS-158477)* --- .../nodes/kernels/scaled_attn/executor_pa.cpp | 105 ++++++++++++++---- 1 file changed, 86 insertions(+), 19 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp index bef34881ca41bc..90167ac86a8e1a 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp @@ -939,14 +939,14 @@ struct MHAHelper { // wv_scratch_b: [rnd_up(kv_len, block_size), Hk, scratch_b_size] void exec_kernel_multiple(const PlainTensor& query, const PlainTensor& present_value, const PlainTensor& output_emb, const PlainTensor& qk_scratch_b, const PlainTensor& wv_scratch_b, const int32_t* block_table, size_t ithr, size_t q_blk, - size_t hk, size_t q_len, size_t cur_kv_len, const PlainTensor& alibi_slopes, float* score_output) { + size_t hq_beg, size_t hq_end, size_t hk, size_t q_len, size_t cur_kv_len, const PlainTensor& alibi_slopes, float* score_output) { auto q_start = q_blk * _block_size; auto q_end = std::min(q_start + _block_size, q_len); auto q_cnt = q_end - q_start; constexpr bool q_is_xf16 = one_of(precision_of::value, ov::element::bf16, ov::element::f16); constexpr bool q_cache_is_same = precision_of::value == precision_of::value; auto cur_kv_len_blocks = div_up(cur_kv_len, _block_size); - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { auto* q_ptr = query.ptr(h, q_start, 0); float* c_ptr = _weight.ptr(ithr, h, 0, 0); // for each query block, loop through all key block @@ -1065,13 +1065,14 @@ struct MHAHelper { // weight: [nthr, H, 32, rnd_up(kv_len, block_size)] // output: [nthr, 32, H, S] void exec_kernel_one_bh(const PlainTensor& query, const PlainTensor& present_key, const PlainTensor& present_value, const PlainTensor& output_emb, - const int32_t* block_table, size_t ithr, size_t hk, size_t q_len, size_t cur_kv_len, const PlainTensor& alibi_slopes, float* score_output) { + const int32_t* block_table, size_t ithr, size_t hq_beg, size_t hq_end, size_t hk, + size_t q_len, size_t cur_kv_len, const PlainTensor& alibi_slopes, float* score_output) { if (one_of(_fastpath_valid_prec, ov::element::bf16, ov::element::f16)) { _gemv->tile_config(); for (size_t pk = 0, i = 0; pk < cur_kv_len; pk += _block_size, i++) { auto block_number = block_table[i]; for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { (*_gemv)(query.ptr(h, pq), present_key.ptr(block_number, hk), _weight.ptr(ithr, h, pq) + pk); } @@ -1082,7 +1083,7 @@ struct MHAHelper { for (size_t pk = 0, i = 0; pk < cur_kv_len; pk += _block_size, i++) { auto block_number = block_table[i]; for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { dot_product_block(query.ptr(h, pq), present_key.ptr(block_number, hk), _weight.ptr(ithr, h, pq) + pk, 
_S, std::min(_block_size, cur_kv_len - pk)); } @@ -1091,7 +1092,7 @@ struct MHAHelper { } for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { // apply attention mask & sofmax float* alibi_lookup = nullptr; float alibi_slope = 0.f; @@ -1122,7 +1123,7 @@ struct MHAHelper { auto block_number = block_table[i]; auto* v = present_value.ptr(block_number, hk); for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { attn_acc_value_block(_output.ptr(ithr, pq, h), _weight.ptr(ithr, h, pq) + pv, v, @@ -1133,7 +1134,7 @@ struct MHAHelper { } // convert to dst for (size_t pq = 0; pq < q_len; pq++) - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) + for (size_t h = hq_beg; h < hq_end; h++) cvt_copy(output_emb.ptr(pq, h * _SV), _output.ptr(ithr, pq, h), _SV); } @@ -1162,8 +1163,38 @@ struct MHAHelper { // aligned to cache line (64bytes=16*sizeof(float)) to avoid false sharing _weight_bhl.resize({B, _H, q_len, rnd_up(max_context_len, std::max(_block_size, size_t{16}))}); - parallel_for3d_dynamic(B, kv_len_in_blocks, _Hk, [&](size_t b, size_t pk_in_blocks, size_t hk) { + // for small batches dynamic scheduler has notable overhead + bool prefer_static_loop; + // if less than 2 work items per thread, loop H + bool loop_hk = B * kv_len_in_blocks * _Hk <= 2 * _nthr ? false : true; + if (B <= 32) { + prefer_static_loop = true; + // small batch and all batch size is same(like SDPA case) + auto kv_len = past_lens.ptr()[0]; + for (size_t b = 1; b < B; b++) { + if (past_lens.ptr()[b] != kv_len) + prefer_static_loop = false; + } + } else { + // for bigger batch skip the test to save the cost + prefer_static_loop = false; + } + auto get_h_params = [] (bool loop_hk, size_t hx, size_t h_each_group_len, size_t& hq_beg, size_t& hq_end, size_t& hk) { + if (loop_hk) { + hk = hx; + hq_beg = hk * h_each_group_len; + hq_end = (hk + 1) * h_each_group_len; + } else { + hq_beg = hx; + hq_end = hx + 1; + hk = hx / h_each_group_len; + } + }; + auto loop_qk = [&](size_t b, size_t pk_in_blocks, size_t hx) { auto context_len = static_cast(past_lens.ptr()[b]) + 1; + size_t hk, hq_beg, hq_end; + get_h_params(loop_hk, hx, _h_each_group_len, hq_beg, hq_end, hk); + // kv_len must be valid auto pk = pk_in_blocks * _block_size; if (pk < context_len) { @@ -1171,7 +1202,7 @@ struct MHAHelper { if (one_of(_fastpath_valid_prec, ov::element::bf16, ov::element::f16)) { _gemv->tile_config(); for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { (*_gemv)(query.ptr(b, h, pq), present_key.ptr(block_number, hk), _weight_bhl.ptr(b, h, pq) + pk); } @@ -1179,16 +1210,16 @@ struct MHAHelper { _gemv->tile_release(); } else { for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { dot_product_block(query.ptr(b, h, pq), present_key.ptr(block_number, hk), _weight_bhl.ptr(b, h, pq) + pk, _S, std::min(_block_size, context_len - pk)); } } } } - }); + }; - parallel_for3d_dynamic(B, _H, q_len, [&](size_t b, size_t h, size_t pq) { + auto loop_softmax = [&](size_t b, size_t h, size_t pq) { auto cur_kv_len = static_cast(past_lens.ptr()[b]) + 1; auto ncausal = cur_kv_len; // apply attention 
mask & sofmax @@ -1210,7 +1241,16 @@ struct MHAHelper { ov::element::f32, ov::element::f32, alibi_slope); - }); + }; + + size_t h_dims = loop_hk ? _Hk : _H; + if (prefer_static_loop) { + parallel_for3d(B, kv_len_in_blocks, h_dims, loop_qk); + parallel_for3d(B, _H, q_len, loop_softmax); + } else { + parallel_for3d_dynamic(B, kv_len_in_blocks, h_dims, loop_qk); + parallel_for3d_dynamic(B, _H, q_len, loop_softmax); + } if (output_score) { parallel_for2d_dynamic(B, q_len, [&](size_t b, size_t pq) { @@ -1229,16 +1269,19 @@ struct MHAHelper { memset(_output_bhl.ptr(ithr, 0, 0, 0, 0), 0, _output_bhl.stride(0) * sizeof(float)); }); - parallel_for3d_dynamic(B, kv_len_in_blocks, _Hk, [&](size_t b, size_t pv_in_blocks, size_t hk) { + auto loop_wk = [&](size_t b, size_t pv_in_blocks, size_t hx) { auto ithr = parallel_get_thread_num(); auto context_len = static_cast(past_lens.ptr()[b]) + 1; auto pv = pv_in_blocks * _block_size; + size_t hk, hq_beg, hq_end; + get_h_params(loop_hk, hx, _h_each_group_len, hq_beg, hq_end, hk); + // kv_len must be valid if (pv < context_len) { auto block_number = block_indices.ptr()[block_indices_begins.ptr()[b] + pv_in_blocks]; auto* v = present_value.ptr(block_number, hk); for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { attn_acc_value_block(_output_bhl.ptr(ithr, b, pq, h), _weight_bhl.ptr(b, h, pq) + pv, v, @@ -1247,7 +1290,13 @@ struct MHAHelper { } } } - }); + }; + + if (prefer_static_loop) { + parallel_for3d(B, kv_len_in_blocks, loop_hk ? _Hk : _H, loop_wk); + } else { + parallel_for3d_dynamic(B, kv_len_in_blocks, loop_hk ? _Hk : _H, loop_wk); + } parallel_for3d(B, _H, q_len, [&](size_t b, size_t h, size_t pq) { auto* temp = _output_bhl.ptr(0, b, pq, h); @@ -1416,7 +1465,23 @@ struct MHA { } }); - parallel_for2d_dynamic(attn_work_count, Hk, [&](size_t w, size_t hk) { + // loop along HK dimension: if mixed first/second token and elements count is enough, loop HK to reuse KV in the CPU cache + // else if elements count is small, prefer to loop H to get more work to avoid thread imbalance + bool loop_hk = _workitems.get_reorder_max_batch_size() == past_lens.m_dims[0] || // if only first token, loop H + attn_work_count * Hk <= 2 * _helper._nthr ? false : true; // or less than 2 work items per thread, loop H + + parallel_for2d_dynamic(attn_work_count, loop_hk ? 
Hk : _helper._H, [&](size_t w, size_t hx) { + size_t hk, hq_beg, hq_end; + if (loop_hk) { + hk = hx; + hq_beg = hk * _helper._h_each_group_len; + hq_end = (hk + 1) * _helper._h_each_group_len; + } else { + hq_beg = hx; + hq_end = hx + 1; + hk = hx / _helper._h_each_group_len; + } + const auto& item = _workitems.get_attn_work_item(w); const auto batch_in_seq = item.batch_in_seq; const auto batch_in_token = subsequence_begins.ptr()[batch_in_seq]; @@ -1434,7 +1499,7 @@ struct MHA { _helper.exec_kernel_one_bh(q.slice(0, batch_in_token, batch_in_token), k_cache, v_cache, output_emb.slice(0, batch_in_token, batch_in_token), block_indices.ptr() + block_indices_begins.ptr()[batch_in_seq], - ithr, hk, 1ul, cur_kv_len, alibi_slopes, + ithr, hq_beg, hq_end, hk, 1ul, cur_kv_len, alibi_slopes, score_output); } else { const auto batch_in_reorder = item.batch_in_reorder; @@ -1461,6 +1526,8 @@ struct MHA { block_indices.ptr() + block_indices_begins.ptr()[batch_in_seq], ithr, q_blk, + hq_beg, + hq_end, hk, q_len, cur_kv_len, From 0dd7434e299bbb2e85a936ab42e9a8bc40729f75 Mon Sep 17 00:00:00 2001 From: Egor Duplenskii Date: Fri, 6 Dec 2024 06:43:59 +0100 Subject: [PATCH 02/23] [CPU][Refactoring] Introduce VariableExecutor (#27883) Depending on the parameters a `FullyConnected` node can use one or multiple executors. With the current approach, even when just a single executor is used, every `prepareParams()` (executor::update()) call goes through executor selection routine. The idea is to avoid such `prepareParams()` overhead for a single executor scenarious, which are probably the most common ones. Thus, split the pipeline input two branches: - only single simple executor is used and updated - a `VariableExecutor` is used and updated. `VariableExecutor` contains two or more simple executors --- .../executors/dnnl/dnnl_fullyconnected.hpp | 4 +- .../src/nodes/executors/executor_factory.hpp | 201 ++++-------------- .../fullyconnected_implementations.cpp | 1 + .../src/nodes/executors/graph_emitter.hpp | 46 +++- .../src/nodes/executors/variable_executor.hpp | 140 ++++++++++++ .../intel_cpu/src/nodes/fullyconnected.cpp | 18 +- .../intel_cpu/src/nodes/fullyconnected.h | 4 +- 7 files changed, 232 insertions(+), 182 deletions(-) create mode 100644 src/plugins/intel_cpu/src/nodes/executors/variable_executor.hpp diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected.hpp index 3266bf8965c37b..1d078feaa6549b 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected.hpp @@ -8,12 +8,12 @@ #include #include "cpu_memory.h" -#include "nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp" -#include "nodes/executors/dnnl/dnnl_convolution_primitive.hpp" #include "nodes/executors/dnnl/dnnl_aliases.hpp" +#include "nodes/executors/dnnl/dnnl_utils.hpp" #include "nodes/executors/executor.hpp" #include "memory_desc/cpu_memory_desc_utils.h" #include "nodes/executors/memory_arguments.hpp" +#include "post_ops.hpp" namespace ov { namespace intel_cpu { diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp index 419ab4abf52cd7..f12795d5d1eb16 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2022 Intel Corporation +// Copyright 
(C) 2018-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -6,50 +6,22 @@ #include #include -#include #include "executor.hpp" -#include "nodes/executors/implementations.hpp" #include "nodes/executors/executor_config.hpp" #include "nodes/executors/executor_implementation.hpp" #include "nodes/executors/graph_emitter.hpp" +#include "nodes/executors/implementations.hpp" #include "nodes/executors/memory_arguments.hpp" #include "nodes/executors/printers.hpp" -#include "openvino/core/except.hpp" +#include "nodes/executors/variable_executor.hpp" #include "post_ops.hpp" namespace ov { namespace intel_cpu { using namespace executor; -template -static ExecutorPtr fallback(const executor::Config& config, - const executor::Config& fallbackConfig, - const MemoryArgs& memory, - const ExecutorContext::CPtr context, - const std::string& name) { - DEBUG_LOG("Falling back to graph executor for ", - name, - ". Original config: ", - config, - " new config:", - fallbackConfig); - - GraphEmitter graphEmitter(config.descs, config.attrs, config.postOps, memory, context, name); - - const auto& graphExecutor = - graphEmitter.createGraph(fallbackConfig.descs, fallbackConfig.attrs, fallbackConfig.postOps, context) - .ensureAttrsMatch() - .ensureSrcDescsMatch() - .ensureDstDescsMatch() - .ensurePostOpsMatch() - .emit(); - (void)graphExecutor; - - OPENVINO_THROW("Fallback logic is not implemented yet"); // return graphExecutor; -} - -template +template class ExecutorFactory { public: using ExecutorImplementationRef = std::reference_wrapper>; @@ -62,9 +34,7 @@ class ExecutorFactory { : m_attrs(attrs), m_postOps(postOps), m_context(context), - m_suitableImplementations(filter(m_attrs, m_postOps, descriptors, implementationPriority)), - m_implementationRequiresFallback(m_suitableImplementations.size(), true), - m_executors(m_suitableImplementations.size()) {} + m_suitableImplementations(filter(m_attrs, m_postOps, descriptors, implementationPriority)) {} /** * @brief Retrieves the proper memory descriptors based on the provided memory descriptors. @@ -95,104 +65,42 @@ class ExecutorFactory { } /** - * @brief Preconfigures an executor based on the provided memory arguments. - * - * Preconfigures an executor by selecting an appropriate implementation based on the provided - * memory arguments and by creating an executor using the implementation. - * - * @param memory The memory parameters used for selecting the appropriate executor implementation. - * - * @note The main use case is to offload executor data preparation (i.e. weights packing) - * From the make() call - * @todo Currently supports creating a single executor. - * For some nodes it can be worth to preconfigure all the executors. - */ - void preconfigure(const MemoryArgs& memory) { - executor::Config config{memoryDescsFromMemory(memory), m_attrs, m_postOps}; - - cacheFallbackStatus(config); - - const size_t implId = select(memory, 0); - const auto& impl = m_suitableImplementations[implId].get(); - DEBUG_LOG("Preconfiguring executor: ", impl.name()); - - if (m_implementationRequiresFallback[implId]) { - if (auto fallbackConfig = impl.requiresFallback(config)) { - fallback(config, *fallbackConfig, memory, m_context, impl.name()); - } - } - - (void)create(implId, memory, m_context); - } - - /** - * @brief Creates an Executor instance based on provided memory arguments. + * @brief Creates an Executor instance based on the provided memory arguments. 
* - * Creates an Executor instance using the provided MemoryArgs, selecting an appropriate implementation - * based on the characteristics of the memory. It handles fallback scenarios if necessary and updates the executor - * with the given memory information. + * Depending on the number of available implementations, returns: + * - VariableExecutor, if the number of implementations is two or more + * - Simple Executor, if there is only one available implementation * * @param memory memory arguments. * * @return A shared pointer to the created Executor. - * - * The function follows the steps below: - * - Selects an implementation based on the provided memory using the select() function. - * - Retrieves the selected implementation and checks if fallback is required. - * - If fallback is required, it creates a fallback configuration and returns a fallback executor. - * - Otherwise creates the executor using the selected implementation. - * - Updates the executor with the given memory information. - * */ - ExecutorPtr make(MemoryArgs& memory) { - auto createExec = [this](MemoryArgs& memory, size_t implId) -> ExecutorPtr { - const auto& impl = m_suitableImplementations[implId].get(); - if (m_implementationRequiresFallback[implId]) { - executor::Config config{memoryDescsFromMemory(memory), m_attrs, m_postOps}; - if (auto fallbackConfig = impl.requiresFallback(config)) { - return fallback(config, *fallbackConfig, memory, m_context, impl.name()); - } - } - const auto executor = create(implId, memory, m_context); - if (!executor->update(memory)) { - return nullptr; + ExecutorPtr make(const MemoryArgs& memory) { + // only single executor is available + if (m_suitableImplementations.size() == 1) { + auto config = GraphEmitter::createConfig(memory, m_attrs, m_postOps); + + const auto& theOnlyImplementation = m_suitableImplementations.front().get(); + + if (const auto fallbackConfig = theOnlyImplementation.requiresFallback(config)) { + return GraphEmitter::fallback(config, + *fallbackConfig, + memory, + m_context, + theOnlyImplementation.name()); } - return executor; - }; - - auto implId = select(memory, 0); - auto executor = createExec(memory, implId); - while (!executor) { - implId = select(memory, ++implId); - executor = createExec(memory, implId); - } - return executor; - } -private: - static MemoryDescArgs memoryDescsFromMemory(const MemoryArgs& memory) { - MemoryDescArgs memoryDescs; - memoryDescs.reserve(memory.size()); - - for (const auto& mem : memory) { - memoryDescs[mem.first] = mem.second->getDescPtr(); + return theOnlyImplementation.create(m_attrs, m_postOps, memory, m_context); } - return memoryDescs; - } - - /** - * @brief Caches the fallback status for each suitable implementation. - */ - void cacheFallbackStatus(const executor::Config& config) { - std::transform(m_suitableImplementations.begin(), - m_suitableImplementations.end(), - m_implementationRequiresFallback.begin(), - [&config](const ExecutorImplementationRef& impl) { - return impl.get().requiresFallback(config); - }); + return std::make_shared>(memory, + m_attrs, + m_postOps, + m_context, + m_suitableImplementations); } +private: /** * @brief Filters and retrieves suitable implementations based on the provided executor configuration. * @@ -205,11 +113,10 @@ class ExecutorFactory { * @note If an implementation is shape agnostic, no further implementations with lower * priority are considered. 
*/ - static std::vector filter( - const Attrs& attrs, - const PostOps& postOps, - const MemoryDescArgs& descs, - const std::string& implementationPriority = {}) { + static std::vector filter(const Attrs& attrs, + const PostOps& postOps, + const MemoryDescArgs& descs, + const std::string& implementationPriority = {}) { const auto& implementations = getImplementations(); std::vector suitableImplementations; const executor::Config config{descs, attrs, postOps}; @@ -244,51 +151,17 @@ class ExecutorFactory { return suitableImplementations; } - size_t select(const MemoryArgs& memory, const size_t startIdx) const { - OPENVINO_ASSERT(startIdx < m_suitableImplementations.size(), - "Failed to find an implementation since start indx: ", startIdx, - " is out of range of the suitable implementations array: ", m_suitableImplementations.size()); - auto startIt = m_suitableImplementations.begin(); - std::advance(startIt, startIdx); - const auto selectedImplementation = - std::find_if(startIt, - m_suitableImplementations.end(), - [&memory](const ExecutorImplementationRef& implementation) { - return implementation.get().shapeAgnostic() || implementation.get().acceptsShapes(memory); - }); - OPENVINO_ASSERT(selectedImplementation != m_suitableImplementations.end(), "Failed to select an implemetation"); - - return std::distance(m_suitableImplementations.begin(), selectedImplementation); - } - - ExecutorPtr create(const size_t implId, - const MemoryArgs& memory, - const ExecutorContext::CPtr context) { - assert(implId < m_executors.size() && implId < m_suitableImplementations.size()); - - if (!m_executors[implId]) { - const auto& impl = m_suitableImplementations[implId].get(); - m_executors[implId] = impl.create(m_attrs, m_postOps, memory, context); - } - - return m_executors[implId]; - } - const Attrs& m_attrs; const PostOps& m_postOps; const ExecutorContext::CPtr m_context; std::vector m_suitableImplementations; - // stores fallback status to avoid performing the check for every make() call - std::vector m_implementationRequiresFallback; - // executors cache - std::vector m_executors; }; -template -using ExecutorFactoryPtr = std::shared_ptr>; +template +using ExecutorFactoryPtr = std::shared_ptr>; -template -using ExecutorFactoryCPtr = std::shared_ptr>; +template +using ExecutorFactoryCPtr = std::shared_ptr>; } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp index 5834c3dda4b262..4cf6992985ecd3 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp @@ -11,6 +11,7 @@ #include "memory_desc/cpu_memory_desc.h" #include "nodes/executors/convolution_config.hpp" #include "nodes/executors/dnnl/dnnl_convolution_primitive.hpp" +#include "nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp" #include "nodes/executors/dnnl/dnnl_fullyconnected.hpp" #include "nodes/executors/dnnl/dnnl_matmul_primitive.hpp" #include "nodes/executors/dnnl/dnnl_shape_agnostic_data.hpp" diff --git a/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp b/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp index 6aad18c793c8cf..784ed8bc778840 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp @@ -5,12 +5,11 @@ #pragma once #include -#include #include "graph.h" 
-#include "memory_desc/cpu_memory_desc.h" #include "node.h" #include "nodes/executors/executor.hpp" +#include "nodes/executors/executor_config.hpp" #include "post_ops.hpp" namespace ov { @@ -72,6 +71,49 @@ class GraphEmitter { return graph; } + static MemoryDescArgs memoryDescsFromMemory(const MemoryArgs& memory) { + MemoryDescArgs memoryDescs; + memoryDescs.reserve(memory.size()); + + for (const auto& mem : memory) { + memoryDescs[mem.first] = mem.second->getDescPtr(); + } + + return memoryDescs; + } + + static executor::Config createConfig(const MemoryArgs& memory, + const Attrs& attrs, + const PostOps& postOps) { + return executor::Config{memoryDescsFromMemory(memory), attrs, postOps}; + } + + static ExecutorPtr fallback(const executor::Config& config, + const executor::Config& fallbackConfig, + const MemoryArgs& memory, + const ExecutorContext::CPtr context, + const std::string& name) { + DEBUG_LOG("Falling back to graph executor for ", + name, + ". Original config: ", + config, + " new config:", + fallbackConfig); + + GraphEmitter graphEmitter(config.descs, config.attrs, config.postOps, memory, context, name); + + const auto& graphExecutor = + graphEmitter.createGraph(fallbackConfig.descs, fallbackConfig.attrs, fallbackConfig.postOps, context) + .ensureAttrsMatch() + .ensureSrcDescsMatch() + .ensureDstDescsMatch() + .ensurePostOpsMatch() + .emit(); + (void)graphExecutor; + + OPENVINO_THROW("Fallback logic is not implemented yet"); // return graphExecutor; + } + private: const MemoryDescArgs& descs; const Attrs& attrs; diff --git a/src/plugins/intel_cpu/src/nodes/executors/variable_executor.hpp b/src/plugins/intel_cpu/src/nodes/executors/variable_executor.hpp new file mode 100644 index 00000000000000..8dfb7a4c63fde4 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/variable_executor.hpp @@ -0,0 +1,140 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "executor.hpp" +#include "executor_config.hpp" +#include "executor_implementation.hpp" +#include "nodes/executors/graph_emitter.hpp" + +namespace ov { +namespace intel_cpu { + +/** + * A stateful (variable) executor + * Contains two or more executors. 
+ * Switches between the executors based on provided Memory (more precisely based on in / out shapes) + */ +template +class VariableExecutor : public Executor { +public: + using ExecutorImplementationRef = std::reference_wrapper>; + + VariableExecutor(const MemoryArgs& memory, + const Attrs& attrs, + const PostOps& postOps, + const ExecutorContext::CPtr context, + std::vector suitableImplementations) + : m_attrs(attrs), + m_postOps(postOps), + m_context(context), + m_suitableImplementations(std::move(suitableImplementations)), + m_implementationRequiresFallback( + cacheFallbackStatus(m_suitableImplementations, + GraphEmitter::createConfig(memory, m_attrs, m_postOps))), + m_executors(m_suitableImplementations.size()) { + const size_t implId = select(memory, 0); + m_executors[implId] = create(implId, memory); + m_implId = implId; + } + + bool update(const MemoryArgs& memory) override { + for (auto implId = select(memory, 0); implId < m_suitableImplementations.size(); + implId = select(memory, implId)) { + if (!m_executors[implId]) { + m_executors[implId] = create(implId, memory); + } + + if (m_executors[implId]->update(memory)) { + m_implId = implId; + return true; + } + } + + return false; + } + + void execute(const MemoryArgs& memory) override { + m_executors[m_implId]->execute(memory); + } + + impl_desc_type implType() const override { + return m_executors[m_implId]->implType(); + } + + void moveMemToNumaNode(int numaID) override { + m_executors[m_implId]->moveMemToNumaNode(numaID); + } + +private: + /** + * @brief Returns a fallback status for each suitable implementation. + */ + static std::vector cacheFallbackStatus(const std::vector& suitableImplementations, + const executor::Config& config) { + std::vector implementationRequiresFallback(suitableImplementations.size()); + std::transform(suitableImplementations.begin(), + suitableImplementations.end(), + implementationRequiresFallback.begin(), + [&config](const ExecutorImplementationRef& impl) { + return impl.get().requiresFallback(config); + }); + + return implementationRequiresFallback; + } + + size_t select(const MemoryArgs& memory, const size_t startIdx) const { + OPENVINO_ASSERT(startIdx < m_suitableImplementations.size(), + "Failed to find an implementation since start indx: ", + startIdx, + " is out of range of the suitable implementations array: ", + m_suitableImplementations.size()); + + auto startIt = m_suitableImplementations.begin() + startIdx; + + const auto selectedImplementation = + std::find_if(startIt, + m_suitableImplementations.end(), + [&memory](const ExecutorImplementationRef& implementation) { + return implementation.get().shapeAgnostic() || implementation.get().acceptsShapes(memory); + }); + + OPENVINO_ASSERT(selectedImplementation != m_suitableImplementations.end(), "Failed to select an implemetation"); + + return std::distance(m_suitableImplementations.begin(), selectedImplementation); + } + + ExecutorPtr create(const size_t implId, const MemoryArgs& memory) { + assert(implId < m_executors.size() && implId < m_suitableImplementations.size()); + + auto createWithFallback = [this](const size_t implId, const MemoryArgs& memory) { + const auto& impl = m_suitableImplementations[implId].get(); + + if (m_implementationRequiresFallback[implId]) { + auto config = GraphEmitter::createConfig(memory, m_attrs, m_postOps); + if (auto fallbackConfig = impl.requiresFallback(config)) { + return GraphEmitter::fallback(config, *fallbackConfig, memory, m_context, impl.name()); + } + } + + return impl.create(m_attrs, m_postOps, 
memory, m_context); + }; + + return createWithFallback(implId, memory); + } + + const Attrs& m_attrs; + const PostOps& m_postOps; + const ExecutorContext::CPtr m_context; + std::vector m_suitableImplementations; + // stores fallback status to avoid performing the check for every make() call + std::vector m_implementationRequiresFallback; + // executors cache + std::vector m_executors; + size_t m_implId; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp index 307125ef0069e0..31ae4f26cc08a1 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp @@ -66,7 +66,7 @@ void FullyConnected::initTensorParallelConfig(const GraphContext::CPtr context) // init tp_cfg.w_rank and tp_cfg.w_size tp_cfg.w_rank = context->getCPUStreamExecutor()->get_rank()[0]; tp_cfg.w_size = ov::threading::message_manager()->get_num_sub_streams(); - tp_cfg.enable_tensor_parallel = tp_cfg.w_size > 1 ? true : false; + tp_cfg.enable_tensor_parallel = tp_cfg.w_size > 1; tp_cfg.sub_memory = context->getSubMemory(); } } @@ -119,16 +119,12 @@ void FullyConnected::needPrepareParamsForTensorParallel() { } } -ExecutorPtr FullyConnected::createExecutor() { - const auto& executor = factory->make(memory); - getSelectedPrimitiveDescriptor()->setImplementationType(executor->implType()); - - return executor; -} - void FullyConnected::prepareParams() { needPrepareParamsForTensorParallel(); - executor = createExecutor(); + + executor->update(memory); + // @todo avoid updating implementation type in scope of every prepareParams call + getSelectedPrimitiveDescriptor()->setImplementationType(executor->implType()); } void FullyConnected::initTensorParallelSync() { @@ -431,7 +427,7 @@ void FullyConnected::initSupportedPrimitiveDescriptors() { needUpdateZeroPointForTensorParallel(); auto executionContext = std::make_shared(context, getImplPriority(), privateWeightCache); - factory = std::make_shared>(attrs, postOps, executionContext, descs); + factory = std::make_shared>(attrs, postOps, executionContext, descs); const auto nodeDescriptors = factory->getProperMemoryDescriptors(descs); NodeConfig nodeConfig; @@ -496,7 +492,7 @@ void FullyConnected::createPrimitive() { needSplitMemoryForTensorParallel(); // @todo should we preconfigure only for dynamic shapes? 
// Since for static shapes primitive is created in scope of compile_model() anyway - factory->preconfigure(memory); + executor = factory->make(memory); Node::createPrimitive(); } diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h index be29342b851988..8c17228e365af4 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h @@ -16,7 +16,6 @@ #include "nodes/executors/memory_arguments.hpp" #include "nodes/executors/fullyconnected_config.hpp" #include "post_ops.hpp" -#include "openvino/runtime/threading/cpu_message.hpp" namespace ov { namespace intel_cpu { @@ -85,7 +84,6 @@ class FullyConnected : public Node { static const size_t WEIGHTS_ID = 1; static const size_t BIAS_ID = 2; - ExecutorPtr createExecutor(); void fuseDecompressionConstant(const MemoryCPtr& memory, MemoryCPtr& decompressionValuesPtr); void initTensorParallelConfig(const GraphContext::CPtr context); @@ -103,7 +101,7 @@ class FullyConnected : public Node { FCAttrs attrs; PostOps postOps; MemoryArgs memory; - ExecutorFactoryPtr factory; + ExecutorFactoryPtr factory; ExecutorPtr executor = nullptr; std::string errorPrefix; From fb1810b8ce36f7d8e7be26a0d5e71444f8c8f047 Mon Sep 17 00:00:00 2001 From: Alina Kladieva Date: Fri, 6 Dec 2024 06:52:36 +0100 Subject: [PATCH 03/23] [tests/llm] Reorder imports to avoid onnx-related DDL load fail (#27942) ### Details: There is an issue with ONNX>=1.17 which causes DLL load failures on Windows. Previously it caused WWB import to fail (CVS-158774), it was fixed in https://github.com/openvinotoolkit/openvino.genai/pull/1301. Now this llm tests failure comes from the next import, optimum.intel.openvino, and it doesn't reproduce locally if optimum.intel.openvino is imported before WWB. Signed-off-by: Alina Kladieva --- tests/llm/accuracy_conformance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/llm/accuracy_conformance.py b/tests/llm/accuracy_conformance.py index 41015d7664ecc2..7f75a8e912bbd6 100644 --- a/tests/llm/accuracy_conformance.py +++ b/tests/llm/accuracy_conformance.py @@ -5,9 +5,9 @@ import tempfile import pytest -import whowhatbench as wwb from optimum.intel.openvino import (OVModelForCausalLM, OVWeightQuantizationConfig) +import whowhatbench as wwb from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed logging.basicConfig(level=logging.INFO) From 536bd69ed66a57869aa6d3bbe06692217997e67e Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Fri, 6 Dec 2024 11:50:30 +0400 Subject: [PATCH 04/23] [GPU] Parse runtime_options from model RT info and apply to config (#27900) ### Details: - Added conversion logic from RT Info attributes to plugin property for limited set of properties. 
Signed-off-by: Vladimir Paramuzov --- .../intel_gpu/runtime/execution_config.hpp | 14 +++ src/plugins/intel_gpu/src/plugin/plugin.cpp | 4 + .../src/runtime/execution_config.cpp | 6 ++ .../tests/functional/behavior/properties.cpp | 99 +++++++++++++++++++ 4 files changed, 123 insertions(+) create mode 100644 src/plugins/intel_gpu/tests/functional/behavior/properties.cpp diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/execution_config.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/execution_config.hpp index 0af98bf1e952d0..3e854e4c9c5ada 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/execution_config.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/execution_config.hpp @@ -138,6 +138,10 @@ class ExecutionConfig { void apply_user_properties(const cldnn::device_info& info); + // Note that RT info property value has lower priority than values set by user via core.set_property or passed to compile_model call + // So this method should be called after setting all user properties, but before apply_user_properties() call. + void apply_rt_info(const ov::RTMap& rt_info); + std::string to_string() const; protected: @@ -147,6 +151,16 @@ class ExecutionConfig { void apply_priority_hints(const cldnn::device_info& info); void apply_debug_options(const cldnn::device_info& info); + template + void apply_rt_info_property(const ov::Property& property, const ov::RTMap& rt_info) { + if (!is_set_by_user(property)) { + auto rt_info_val = rt_info.find(property.name()); + if (rt_info_val != rt_info.end()) { + set_user_property(property(rt_info_val->second.template as())); + } + } + } + private: ov::AnyMap internal_properties; ov::AnyMap user_properties; diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 7775a153a99e8f..c8839472a6d962 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -189,6 +189,8 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< ExecutionConfig config = m_configs_map.at(device_id); config.set_user_property(orig_config); + if (model->has_rt_info("runtime_options")) + config.apply_rt_info(model->get_rt_info("runtime_options")); config.apply_user_properties(context->get_engine().get_device_info()); set_cache_info(model, config); @@ -278,6 +280,8 @@ ov::SupportedOpsMap Plugin::query_model(const std::shared_ptr& ExecutionConfig config = m_configs_map.at(device_id); config.set_user_property(orig_config); + if (model->has_rt_info("runtime_options")) + config.apply_rt_info(model->get_rt_info("runtime_options")); config.apply_user_properties(ctx->get_engine().get_device_info()); ProgramBuilder prog(ctx->get_engine(), config); diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index 4eaccf5540bd2a..30a9477e1600dd 100644 --- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -257,6 +257,12 @@ void ExecutionConfig::apply_user_properties(const cldnn::device_info& info) { user_properties.clear(); } +void ExecutionConfig::apply_rt_info(const ov::RTMap& rt_info) { + apply_rt_info_property(ov::hint::kv_cache_precision, rt_info); + apply_rt_info_property(ov::hint::dynamic_quantization_group_size, rt_info); + apply_rt_info_property(ov::hint::activations_scale_factor, rt_info); +} + std::string ExecutionConfig::to_string() const { std::stringstream s; s << "internal properties:\n"; diff --git 
a/src/plugins/intel_gpu/tests/functional/behavior/properties.cpp b/src/plugins/intel_gpu/tests/functional/behavior/properties.cpp new file mode 100644 index 00000000000000..93a00262db35c2 --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/behavior/properties.cpp @@ -0,0 +1,99 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/runtime/properties.hpp" +#include "base/ov_behavior_test_utils.hpp" +#include "openvino/runtime/core.hpp" +#include "common_test_utils/subgraph_builders/conv_pool_relu.hpp" + +namespace { + +class TestPropertiesGPU : public ::testing::Test { +public: + std::shared_ptr model; + + void SetUp() override { + SKIP_IF_CURRENT_TEST_IS_DISABLED(); + model = ov::test::utils::make_conv_pool_relu(); + } +}; + +TEST_F(TestPropertiesGPU, NoRTInfo) { + ov::Core core; + ov::Any type; + ov::Any size; + ov::Any scale; + ov::CompiledModel compiled_model; + + OV_ASSERT_NO_THROW(compiled_model = core.compile_model(model, ov::test::utils::DEVICE_GPU)); + OV_ASSERT_NO_THROW(type = compiled_model.get_property(ov::hint::kv_cache_precision)); + OV_ASSERT_NO_THROW(size = compiled_model.get_property(ov::hint::dynamic_quantization_group_size)); + OV_ASSERT_NO_THROW(scale = compiled_model.get_property(ov::hint::activations_scale_factor)); +} + +TEST_F(TestPropertiesGPU, RTInfoPropertiesWithDefault) { + ov::Core core; + ov::Any type; + ov::Any size; + ov::Any scale; + ov::CompiledModel compiled_model; + model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name()); + model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name()); + model->set_rt_info("8.0", "runtime_options", ov::hint::activations_scale_factor.name()); + + OV_ASSERT_NO_THROW(compiled_model = core.compile_model(model, ov::test::utils::DEVICE_GPU)); + OV_ASSERT_NO_THROW(type = compiled_model.get_property(ov::hint::kv_cache_precision)); + OV_ASSERT_NO_THROW(size = compiled_model.get_property(ov::hint::dynamic_quantization_group_size)); + OV_ASSERT_NO_THROW(scale = compiled_model.get_property(ov::hint::activations_scale_factor)); + ASSERT_EQ(type.as(), ov::element::f16); + ASSERT_EQ(size.as(), 0); + ASSERT_EQ(scale.as(), 8.0f); +} + +TEST_F(TestPropertiesGPU, RTInfoPropertiesWithUserValuesFromCore) { + ov::Core core; + ov::Any type; + ov::Any size; + ov::Any scale; + ov::CompiledModel compiled_model; + model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name()); + model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name()); + model->set_rt_info("8.0", "runtime_options", ov::hint::activations_scale_factor.name()); + core.set_property(ov::hint::kv_cache_precision(ov::element::u8)); + core.set_property(ov::hint::dynamic_quantization_group_size(16)); + core.set_property(ov::hint::activations_scale_factor(4.0f)); + + OV_ASSERT_NO_THROW(compiled_model = core.compile_model(model, ov::test::utils::DEVICE_GPU)); + OV_ASSERT_NO_THROW(type = compiled_model.get_property(ov::hint::kv_cache_precision)); + OV_ASSERT_NO_THROW(size = compiled_model.get_property(ov::hint::dynamic_quantization_group_size)); + OV_ASSERT_NO_THROW(scale = compiled_model.get_property(ov::hint::activations_scale_factor)); + ASSERT_EQ(type.as(), ov::element::u8); + ASSERT_EQ(size.as(), 16); + ASSERT_EQ(scale.as(), 4.0f); +} + +TEST_F(TestPropertiesGPU, RTInfoPropertiesWithUserValuesFromCompileModel) { + ov::Core core; + ov::Any type; + ov::Any size; + ov::Any scale; + ov::CompiledModel compiled_model; + 
model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name()); + model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name()); + model->set_rt_info("8.0", "runtime_options", ov::hint::activations_scale_factor.name()); + ov::AnyMap config; + config[ov::hint::kv_cache_precision.name()] = "u8"; + config[ov::hint::dynamic_quantization_group_size.name()] = "16"; + config[ov::hint::activations_scale_factor.name()] = "4.0"; + + OV_ASSERT_NO_THROW(compiled_model = core.compile_model(model, ov::test::utils::DEVICE_GPU, config)); + OV_ASSERT_NO_THROW(type = compiled_model.get_property(ov::hint::kv_cache_precision)); + OV_ASSERT_NO_THROW(size = compiled_model.get_property(ov::hint::dynamic_quantization_group_size)); + OV_ASSERT_NO_THROW(scale = compiled_model.get_property(ov::hint::activations_scale_factor)); + ASSERT_EQ(type.as(), ov::element::u8); + ASSERT_EQ(size.as(), 16); + ASSERT_EQ(scale.as(), 4.0f); +} + +} // namespace From eed4a60be67dbb22825a4fad20245ae806e11634 Mon Sep 17 00:00:00 2001 From: Karol Blaszczak Date: Fri, 6 Dec 2024 10:15:28 +0100 Subject: [PATCH 05/23] [DOCS] test drive doc (#27933) A document for test drive and some minor tweaks in other areas --- .../documentation/openvino-ecosystem.rst | 9 ++ .../openvino-test-drive.rst | 109 ++++++++++++++++++ .../llm_inference_guide/genai-guide.rst | 2 +- .../benchmarks_files/llm_models_7-155H.csv | 1 + .../benchmarks_files/llm_models_7-258V.csv | 1 + .../benchmarks_files/llm_models_9-288V.csv | 3 +- .../_static/download/supported_models.csv | 1 - 7 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 docs/articles_en/documentation/openvino-ecosystem/openvino-test-drive.rst diff --git a/docs/articles_en/documentation/openvino-ecosystem.rst b/docs/articles_en/documentation/openvino-ecosystem.rst index 6735192e95f674..fe4f203428a865 100644 --- a/docs/articles_en/documentation/openvino-ecosystem.rst +++ b/docs/articles_en/documentation/openvino-ecosystem.rst @@ -12,6 +12,7 @@ OpenVINO™ Ecosystem Overview :hidden: openvino-ecosystem/openvino-training-extensions + openvino-ecosystem/openvino-test-drive openvino-ecosystem/datumaro openvino-ecosystem/openvino-security-add-on @@ -102,6 +103,14 @@ development process, empowering teams to produce custom AI models at scale. |hr| +| **Intel® Test Drive** +| :bdg-link-dark:`Github ` + +OpenVINO™ Test Drive is cross-platform graphic user interface application that enables running +generative AI and vision models directly on your computer or edge device using OpenVINO™ Runtime. +|hr| + + | **Tokenizers** | :bdg-link-dark:`Github ` :bdg-link-success:`User Guide ` diff --git a/docs/articles_en/documentation/openvino-ecosystem/openvino-test-drive.rst b/docs/articles_en/documentation/openvino-ecosystem/openvino-test-drive.rst new file mode 100644 index 00000000000000..527a01bf38a6cf --- /dev/null +++ b/docs/articles_en/documentation/openvino-ecosystem/openvino-test-drive.rst @@ -0,0 +1,109 @@ +=============================================================================================== +OpenVINO™ Test Drive +=============================================================================================== + + +.. meta:: + :description: See how to test your models with OpenVINO, using a simple graphic interface of + Test Drive. + + + +OpenVINO™ Test Drive is a cross-platform graphic user interface application for running and +testing AI models, both generative and vision based. 
+It can run directly on your computer or on edge devices using +`OpenVINO™ Runtime `__. + +OpenVINO™ Test Drive is developed under the `openvino_testdrive repository `__. + +Use OpenVINO™ Test Drive to: + +* **Chat with LLMs** and evaluate model performance on your computer or edge device; +* **Experiment with different text prompts** to generate images, using Stable + Diffusion and Stable DiffusionXL models (coming soon); +* **Transcribe speech from video**, using Whisper models, including generation + of timestamps (coming soon); +* **Run inference of models** trained by Intel® Geti™ and **visualize the results**. + + + +Installation (Windows) +############################################################################################### + +1. Download the latest archive from the + `release repository `__. + To verify the integrity of the downloaded package, use the SHA-256 file attached. + +2. Extract the zip file and run the *MSIX* installation package. Click the `Install` button to + proceed. + +3. Launch OpenVINO™ Test Drive, clicking the application name in the Windows app list. + + +Quick start +############################################################################################### + +When starting the application, you can import an LLM model from Hugging Face Hub +or upload an Intel® Geti™ model from a local drive. + +Inference of models from Hugging Face ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +1. Find a model on `Hugging Face `__ and import it. + +2. Chat with LLMs via the `Playground` tab. + +3. Use the `Performance metrics` tab to get model performance metrics on your + computer or an edge device. + + + +Inference of models trained with Intel® Geti™ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +1. Download the deployment code for a model in the OpenVINO IR format trained + by Intel® Geti™ (refer to the `Intel® Geti™ documentation `__ + for more details). + +2. Import the deployment code into OpenVINO™ Test Drive, using the *Import model* and then + *Local disk* buttons. + +3. Use the *Live inference* tab to run and visualize results of inference of individual images. + +4. For batch inference, use the *Batch inference* tab and provide paths to the folder + with input images, as well as one for batch inference results. You can do so by filling out + the *Source folder* and *Destination folder* fields. Click *Start* to start batch inference. + + +Build the Application +############################################################################################### + +1. Make sure you `Install flutter SDK `__ + and all its platform-specific dependencies. +2. Build the bindings and place them in the **./bindings** folder. + + OpenVINO™ Test Drive uses bindings to `OpenVINO™ GenAI `__ + and `OpenVINO™ Model API `__, + which are located in the **./openvino_bindings** folder. Refer to the + `GitHub page `__ + for more details. + +3. Start the application, using the following command: + + .. code-block:: console + + flutter run + +Additional Resources +############################################################################################### + +- `OpenVINO™ `__ - a software toolkit + for optimizing and deploying deep learning models. +- `GenAI Repository `__ and + `OpenVINO Tokenizers `__ + - resources and tools for developing and optimizing Generative AI applications. +- `Intel® Geti™ `__ - software for building computer + vision models. 
+- `OpenVINO™ Model API `__ + - a set of wrapper classes for particular tasks and model architectures. + It simplifies routine procedures, preprocessing and postprocessing of data. diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst index 37b6091eb9b898..42c1c3fb47aa42 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst @@ -18,7 +18,7 @@ make sure to :doc:`install OpenVINO with GenAI <../../get-started/install-openvi .. image:: ../../assets/images/genai_main_diagram.svg :align: center - :alt: OpenVINO workflow diagram for convenience + :alt: OpenVINO GenAI workflow diagram | Here is sample code for several Generative AI use case scenarios. Note that these are very basic diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv index fa5ae359fa45c0..9481b5619244e2 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv +++ b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv @@ -1,3 +1,4 @@ +Topology,Precision,Input Size,max rss memory,1st latency (ms),2nd latency (ms),2nd tok/sec opt-125m-gptq,INT4-MIXED,32,1116,25.8,8.1,123.5 opt-125m-gptq,INT4-MIXED,1024,1187.1,75.2,8.2,122.0 qwen2-0.5b,INT4-MIXED,32,1587.4,45.1,15.4,64.9 diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv index 9aa769e4dd61b9..625ff1d6fe5ed5 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv +++ b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv @@ -1,3 +1,4 @@ +Topology,Precision,Input Size,max rss memory,1st latency (ms),2nd latency (ms),2nd tok/sec opt-125m-gptq,INT4-MIXED,32,1150.2,35.1,8.2,122.0 opt-125m-gptq,INT4-MIXED,1024,1228,67,8.2,122.0 qwen2-0.5b,INT4-MIXED,1024,1596.2,83.6,14.4,69.4 diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models_9-288V.csv b/docs/sphinx_setup/_static/benchmarks_files/llm_models_9-288V.csv index dfc98271bcd21b..c1932e678505ff 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/llm_models_9-288V.csv +++ b/docs/sphinx_setup/_static/benchmarks_files/llm_models_9-288V.csv @@ -1,4 +1,5 @@ -opt-125m-gptq,INT4-MIXED,32,833.1,15.6,3.9,256.4 +Topology,Precision,Input Size,max rss memory,1st latency (ms),2nd latency (ms),2nd tok/sec +opt-125m-gptq,INT4-MIXED,32,833.1,15.6,3.9,256.4 opt-125m-gptq,INT4-MIXED,1024,955.9,553.8,4.8,208.3 bloomz-560m,INT4-MIXED,32,1457.5,48.5,11.1,90.1 qwen2-0.5b,INT4-MIXED,32,1167.8,95.7,11.5,87.0 diff --git a/docs/sphinx_setup/_static/download/supported_models.csv b/docs/sphinx_setup/_static/download/supported_models.csv index 87ea37b0f207c3..39053fa6d3e0a7 100644 --- a/docs/sphinx_setup/_static/download/supported_models.csv +++ b/docs/sphinx_setup/_static/download/supported_models.csv @@ -715,7 +715,6 @@ tiny-random-BeitForImageClassification,Image Classification,pytorch,intel-optimu tiny-random-bert,Natural Language Processing,pytorch,intel-optimum default,+,, tiny-random-BlenderbotModel,Large Language Model,pytorch,INT4,+,, tiny-random-BloomModel,Large Language Model,pytorch,INT4,+,, -tiny-random-chatglm2,Large Language Model,pytorch,INT4,+,, tiny-random-codegen2,Large Language Model,pytorch,INT4,+,, tiny-random-CodeGenForCausalLM,Large Language Model,pytorch,INT4,+,, 
tiny-random-CohereForCausalLM,Large Language Model,pytorch,INT4,+,, From 0f1e5092b518402248e372c2401651a0bd150f7f Mon Sep 17 00:00:00 2001 From: Andrzej Kopytko Date: Fri, 6 Dec 2024 11:41:12 +0100 Subject: [PATCH 06/23] [DOCS] Remove OVMS Button (#27951) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- docs/articles_en/about-openvino/performance-benchmarks.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/articles_en/about-openvino/performance-benchmarks.rst b/docs/articles_en/about-openvino/performance-benchmarks.rst index 5d9abfe891584f..a398432925a983 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks.rst @@ -56,7 +56,8 @@ implemented in your solutions. Click the buttons below to see the chosen benchma :material-regular:`table_view;1.4em` LLM performance for AI PC - .. grid-item:: +.. uncomment under + .. .. grid-item:: .. button-link:: # :class: ovms-toolkit-benchmark-llm-result From c3b014c49afa04a838e1778184cf97a1c834e465 Mon Sep 17 00:00:00 2001 From: Tomasz Jankowski Date: Fri, 6 Dec 2024 11:55:16 +0100 Subject: [PATCH 07/23] [Templ test] GroupNormalization: Enable whole Tensor comparison (#27932) ### Details: - Removed legacy comparison method. - Set relative threshold for fp16. ### Tickets: - CVS-137168 Signed-off-by: Tomasz Jankowski --- .../tests/functional/op_reference/group_normalization.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/plugins/template/tests/functional/op_reference/group_normalization.cpp b/src/plugins/template/tests/functional/op_reference/group_normalization.cpp index 322d509aa838ec..b3bd898db4eeec 100644 --- a/src/plugins/template/tests/functional/op_reference/group_normalization.cpp +++ b/src/plugins/template/tests/functional/op_reference/group_normalization.cpp @@ -42,11 +42,14 @@ class ReferenceGroupNormalization : public testing::TestWithParam& obj) { From bf62609711227605d381bedfcd993e6c60475975 Mon Sep 17 00:00:00 2001 From: Taylor Yeonbok Lee Date: Fri, 6 Dec 2024 23:51:58 +0900 Subject: [PATCH 08/23] [GPU] MLP : 2fcs + swiglu fusion (#27831) ### Details: - 2 FCs + swiglu in MLP pattern are fused - Only applied to cldnn && #EUs > 128 && glu type with swiglu ### Tickets: - 152163 --- .../intel_gpu/runtime/debug_configuration.hpp | 1 + .../include/intel_gpu/runtime/layout.hpp | 5 + .../intel_gpu/src/graph/fully_connected.cpp | 26 +++- .../prepare_primitive_fusing.cpp | 62 +++++++++- .../src/graph/impls/ocl/fully_connected.cpp | 14 ++- .../impls/ocl/kernel_selector_helper.cpp | 10 +- .../src/graph/include/pass_manager.h | 1 + .../intel_gpu/src/graph/include/swiglu_inst.h | 9 ++ .../intel_gpu/src/graph/primitive_inst.cpp | 11 ++ .../intel_gpu/src/graph/program_node.cpp | 22 ++++ .../fully_connected_gpu_bf_tiled.cl | 117 ++++++++++++++++-- .../fully_connected_gpu_bf_tiled_common.cl | 49 +++++++- .../fully_connected_kernel_bf_tiled.cpp | 65 +++++++--- .../fully_connected_kernel_bf_tiled.h | 3 +- .../kernels/swiglu/swiglu_kernel_base.h | 11 ++ .../intel_gpu/src/plugin/ops/swiglu.cpp | 4 +- .../transformations/fc_horizontal_fusion.cpp | 19 ++- .../transformations/fc_horizontal_fusion.hpp | 2 +- .../src/plugin/transformations_pipeline.cpp | 10 +- .../src/runtime/debug_configuration.cpp | 5 +- .../fusions/fully_connected_fusion_test.cpp | 59 ++++++++- .../tests/unit/fusions/fusion_test_common.hpp | 12 ++ 22 files changed, 469 insertions(+), 48 deletions(-) diff --git 
a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp index a020c5d1cd5ef6..a7a8ae1f229a72 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp @@ -147,6 +147,7 @@ class debug_configuration { int use_kv_cache_compression; // Enable KV-cache compression int dynamic_quantize_group_size; // Enable Dynamic quantization for fully connected primitive by specified group size int disable_horizontal_fc_fusion; // Disable fc horizontal fusion + int disable_fc_swiglu_fusion; // Disable swiglu fusion to fc std::set dump_iteration; // Dump n-th execution of network. std::vector load_layers_raw_dump; // List of layers to load dumped raw binary and filenames static const debug_configuration *get_instance(); diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp index ab5cb53454b768..cc753d10aea9cd 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp @@ -50,6 +50,11 @@ struct data_type_traits { return et.is_quantized() && et.bitwidth() == 8; } + static bool is_i4_u4(data_types data_type) { + auto et = ov::element::Type(data_type); + return et.bitwidth() == 4; + } + static ov::element::Type max_type(ov::element::Type t1, ov::element::Type t2) { if (t1.bitwidth() < t2.bitwidth()) return t2; diff --git a/src/plugins/intel_gpu/src/graph/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/fully_connected.cpp index bc1e3e2e82b3ca..308d9a9f2fd66b 100644 --- a/src/plugins/intel_gpu/src/graph/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/graph/fully_connected.cpp @@ -7,8 +7,10 @@ #include #include #include "utils.hpp" +#include "swiglu_inst.h" #include "matmul_shape_inference.hpp" +#include "glu_shape_inference.hpp" namespace cldnn { GPU_DEFINE_PRIMITIVE_TYPE_ID(fully_connected) @@ -171,14 +173,32 @@ std::vector fully_connected_inst::calc_output_layouts(fully_connected_no output_type = impl_param.get_output_element_type(); } - ov::op::v0::MatMul op; - op.set_transpose_b(true); + ov::op::v0::MatMul matmul_op; + matmul_op.set_transpose_b(true); std::vector input_shapes = { input_layout.get(), weights_layout.get() }; - std::vector output_shapes = ov::op::v0::shape_infer(&op, input_shapes); + std::vector output_shapes = ov::op::v0::shape_infer(&matmul_op, input_shapes); + bool has_swiglu = false; + auto& fused_prims = node.get_fused_primitives(); + for (auto f : fused_prims) { + if (f.is_type()) { + has_swiglu = true; + OPENVINO_ASSERT(fused_prims.size() == 1, "Other operation is fused in addition to swiglu!"); + } + } + if (has_swiglu) { + ov::op::internal::GLU swiglu_op; + OPENVINO_ASSERT(fused_prims.size() == 1); + OPENVINO_ASSERT(fused_prims[0].typed_desc()->glu_type == ov::op::internal::GLU::GluType::Swish); + swiglu_op.set_axis(fused_prims[0].typed_desc()->axis); + swiglu_op.set_split_lengths(fused_prims[0].typed_desc()->split_lengths); + swiglu_op.set_glu_type(fused_prims[0].typed_desc()->glu_type); + std::vector input_shapes = { output_shapes[0] }; + output_shapes = shape_infer(&swiglu_op, input_shapes); + } bool is_static = input_layout.is_static() && weights_layout.is_static(); bool allow_new_shape_infer = impl_param.get_program().is_new_shape_infer(); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp 
b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 60d1e8aa7e10b7..29b7cf58a19b54 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -1,7 +1,7 @@ // Copyright (C) 2018-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // - +#include "intel_gpu/runtime/debug_configuration.hpp" #include "program_helpers.h" #include "pass_manager.h" @@ -37,6 +37,7 @@ #include "strided_slice_inst.h" #include "cum_sum_inst.h" #include "embedding_bag_inst.h" +#include "swiglu_inst.h" #include "extract_image_patches_inst.h" #include "reduce_inst.h" #include "group_normalization_inst.h" @@ -56,6 +57,7 @@ using namespace cldnn; void prepare_primitive_fusing::run(program& p) { fuse_reorders(p); remove_redundant_reshape(p); + fuse_swiglu(p); fuse_bias(p); fuse_simple_primitives(p); fuse_constant_transposes(p); @@ -161,6 +163,46 @@ void prepare_primitive_fusing::fuse_reorders(program &p) { } } +void prepare_primitive_fusing::fuse_swiglu(program &p) { + GPU_DEBUG_GET_INSTANCE(debug_config); + bool disable_fc_swiglu_fusion = false; + GPU_DEBUG_IF(debug_config->disable_fc_swiglu_fusion == 1) + disable_fc_swiglu_fusion = true; + // Apply only for high performant GPU + if (disable_fc_swiglu_fusion || p.get_engine().get_device_info().execution_units_count < 128) + return; + // TODO: to support other glu types && other weight data types + auto itr = p.get_processing_order().begin(); + std::map>> fusing_history; + while (itr != p.get_processing_order().end()) { + auto node_itr = itr++; + auto& node = (*node_itr); + if (node->is_type()) { + if (!node->get_dependency(0).is_type()) + continue; + auto swiglu_prim = node->get_kernel_impl_params()->typed_desc(); + auto& fc_node = node->get_dependency(0); + if (node->get_dependencies().size() > 1) + continue; + if (!node->get_dependency(0).get_fused_primitives().empty()) + continue; + auto in_dt = fc_node.get_input_layout(0).data_type; + if (in_dt != data_types::f16) + continue; + auto wt_dt = fc_node.get_input_layout(1).data_type; + if (!data_type_traits::is_i4_u4(wt_dt)) + continue; + if (swiglu_prim->glu_type != ov::op::internal::GLU::GluType::Swish || + !(swiglu_prim->axis == -1 || swiglu_prim->axis == static_cast(node->get_output_layout(0).get_partial_shape().size()) - 1)) + continue; + GPU_DEBUG_TRACE_DETAIL << node->id() << " : fuse swiglu to " << fc_node.id() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - split axis : " << swiglu_prim->axis << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - split length : " << swiglu_prim->split_lengths << std::endl; + p.fuse_nodes(fc_node, *node, &fusing_history); + } + } +} + void prepare_primitive_fusing::fuse_bias(program &p) { auto itr = p.get_processing_order().begin(); while (itr != p.get_processing_order().end()) { @@ -188,6 +230,17 @@ void prepare_primitive_fusing::fuse_bias(program &p) { if (!is_bias_add) continue; + for (auto& dep : eltw_node.get_dependencies()) { + auto& fused_prims = dep.first->get_fused_primitives(); + if (std::any_of(fused_prims.begin(), fused_prims.end(), [](const fused_primitive_desc& f_desc) { + return f_desc.is_type(); + })) { + GPU_DEBUG_TRACE_DETAIL << "Skip fusing " << eltw_node.id() << " to " << dep.first->id() << " because " + << dep.first->id() << " has fused swiglu." 
<< std::endl; + continue; + } + } + auto is_3d_fully_connected = [](program_node& node) { if (!node.is_type()) return false; @@ -491,6 +544,13 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { }; auto fc_supports_fusings = [&](fully_connected_node& node) -> bool { + auto& fused_prims = node.get_fused_primitives(); + if (std::any_of(fused_prims.begin(), fused_prims.end(), [](const fused_primitive_desc& f_desc) { + return f_desc.is_type(); + })) { + GPU_DEBUG_TRACE_DETAIL << node.id() << " has fused swiglu. Skip fusing more primitives" << std::endl; + return false; + } if (lo.has_all_enabled_onednn_impls_optimization_attribute() && lo.get_preferred_impl_type(node, format::any /*dummy*/) == impl_types::onednn) { return true; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp index 04f691c2bd2ca9..110444c2c6255c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp @@ -132,15 +132,16 @@ struct fully_connected_impl : typed_primitive_impl_ocl { return layouts; }; - auto get_fc_output_layout = [primitive](const std::vector& input_layouts, const layout& output_layout) { + auto get_fc_output_layout = [primitive](const std::vector& input_layouts, const layout& output_layout, bool swiglu_fused) { auto updated_out_layout = output_layout; auto input0_pshape = input_layouts[0].get_partial_shape(); auto input1_pshape = input_layouts[1].get_partial_shape(); ov::PartialShape updated_out_pshape {input0_pshape[0], input1_pshape[0]}; + const auto output_feature_size = swiglu_fused ? input1_pshape[0] / 2 : input1_pshape[0]; if (primitive->input_size == 3) { - updated_out_pshape = { input0_pshape[0], input0_pshape[1], input1_pshape[0] }; + updated_out_pshape = { input0_pshape[0], input0_pshape[1], output_feature_size}; } updated_out_layout.set_partial_shape(updated_out_pshape); @@ -149,6 +150,13 @@ struct fully_connected_impl : typed_primitive_impl_ocl { bool allow_new_shape_infer = impl_param.get_program().is_new_shape_infer(); auto updated_impl_param = impl_param; + bool swiglu_fused = false; + if (updated_impl_param.fused_desc.size() > 0) { + for (const auto& f : updated_impl_param.fused_desc) { + if (f.is_type()) + swiglu_fused = true; + } + } const auto input_layouts = get_fc_input_layouts(impl_param.input_layouts, allow_new_shape_infer); for (size_t i = 0; i < input_layouts.size(); ++i) { @@ -156,7 +164,7 @@ struct fully_connected_impl : typed_primitive_impl_ocl { } updated_impl_param.weights_layout = input_layouts[1]; - updated_impl_param.output_layouts[0] = get_fc_output_layout(input_layouts, impl_param.get_output_layout()); + updated_impl_param.output_layouts[0] = get_fc_output_layout(input_layouts, impl_param.get_output_layout(), swiglu_fused); return updated_impl_param; } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp index 0a999a5a124d3b..42d83a0265d290 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp @@ -32,11 +32,13 @@ #include "intel_gpu/primitives/embedding_bag.hpp" #include "intel_gpu/primitives/extract_image_patches.hpp" +#include "swiglu_inst.h" #include "activation_inst.h" #include "eltwise_inst.h" #include "quantize_inst.h" #include "reorder_inst.h" +#include 
"kernel_selector/kernels/swiglu/swiglu_kernel_base.h" #include "kernel_selector/kernels/activation/activation_kernel_base.h" #include "kernel_selector/kernels/depth_to_space/depth_to_space_kernel_base.h" #include "kernel_selector/kernels/eltwise/eltwise_kernel_base.h" @@ -1009,7 +1011,13 @@ kernel_selector::activation_function get_kernel_selector_activation_param(activa } std::shared_ptr convert_fuse_params(std::shared_ptr p) { - if (p->type() == activation::type_id()) { + if (p->type() == swiglu::type_id()) { + auto casted = std::dynamic_pointer_cast(p); + auto axis = casted->_desc->axis; + auto split_length = casted->_desc->split_lengths; + auto split_to_glu_idx = casted->_desc->split_to_glu_idx; + return std::make_shared(axis, split_length, split_to_glu_idx); + } else if (p->type() == activation::type_id()) { auto casted = std::dynamic_pointer_cast(p); auto desc = casted->_desc; kernel_selector::base_activation_params p; diff --git a/src/plugins/intel_gpu/src/graph/include/pass_manager.h b/src/plugins/intel_gpu/src/graph/include/pass_manager.h index 61c34c0eff548f..490076a37f788e 100644 --- a/src/plugins/intel_gpu/src/graph/include/pass_manager.h +++ b/src/plugins/intel_gpu/src/graph/include/pass_manager.h @@ -140,6 +140,7 @@ class prepare_primitive_fusing : public base_pass { private: void run(program& p) override; void fuse_bias(program &p); + void fuse_swiglu(program &p); void fuse_reorders(program& p); void fuse_simple_primitives(program &p); void fuse_constant_transposes(program &p); diff --git a/src/plugins/intel_gpu/src/graph/include/swiglu_inst.h b/src/plugins/intel_gpu/src/graph/include/swiglu_inst.h index 6a5ce08dc54bd2..755e9ab33c2db6 100644 --- a/src/plugins/intel_gpu/src/graph/include/swiglu_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/swiglu_inst.h @@ -10,6 +10,11 @@ namespace cldnn { +class SwigluFuseParams : public NodeFuseParams { +public: + SwigluFuseParams(std::shared_ptr desc) : NodeFuseParams(swiglu::type_id()), _desc(std::move(desc)) {} + std::shared_ptr _desc; +}; template <> struct typed_program_node : public typed_program_node_base { using parent = typed_program_node_base; @@ -19,6 +24,10 @@ struct typed_program_node : public typed_program_node_base { program_node& input(size_t index = 0) const { return get_dependency(index); } std::vector get_shape_infer_dependencies() const override { return {}; } + + std::shared_ptr get_fuse_params() const override { + return std::make_shared(typed_desc()); + } }; using swiglu_node = typed_program_node; diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 5680eedcb8f87c..0737362405ff9c 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -38,6 +38,7 @@ #include "gather_inst.h" #include "broadcast_inst.h" #include "dynamic_quantize_inst.h" +#include "swiglu_inst.h" #include "experimental_detectron_roi_feature_extractor_inst.hpp" #include "impls/registry/implementation_manager.hpp" #include "impls/registry/registry.hpp" @@ -2606,6 +2607,16 @@ bool primitive_inst::is_valid_fusion() const { } else { if (fd.is_type() || fd.is_type()) continue; + if (fd.is_type()) { + OPENVINO_ASSERT(_node->is_type() && _node->get_preferred_impl_type() == impl_types::ocl); + if (!_node->get_selected_impl()) + return false; + // TODO : support ref kernel too + if (_node->get_selected_impl()->get_kernel_name().find("fully_connected_gpu_bf_tiled") != std::string::npos) + return true; + else + return 
false; + } OPENVINO_THROW("[GPU] Unsupported fused operation in dynamic shape: type=", fd.desc->type_string(), ", id=", fd.desc->id); } diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index 201fa3a155caa9..5161887b79e57a 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -10,6 +10,7 @@ #include "activation_inst.h" #include "reorder_inst.h" #include "quantize_inst.h" +#include "swiglu_inst.h" #include "intel_gpu/runtime/debug_configuration.hpp" #ifdef ENABLE_ONEDNN_FOR_GPU #include "convolution_inst.h" @@ -770,6 +771,15 @@ void program_node::save(cldnn::BinaryOutputBuffer& ob) const { ob << casted->_out_hi; ob << casted->_out_scale; ob << casted->_out_shift; + } else if (f_desc.f_param->type() == swiglu::type_id()) { + auto casted = std::dynamic_pointer_cast(f_desc.f_param); + if (get_program().has_node(casted->_desc->id)) { + ob << true; + ob << casted->_desc->id; + } else { + ob << false; + ob << casted->_desc; + } } ob << f_desc.deps.size(); @@ -975,6 +985,18 @@ void program_node::load(cldnn::BinaryInputBuffer& ib) { need_pre_shift, need_clamp, need_min_clamp, need_max_clamp, per_tensor_input_range, per_tensor_input_scale, per_tensor_input_shift, per_tensor_output_range, per_tensor_output_scale, per_tensor_output_shift, in_lo, in_hi, in_scale, in_shift, out_lo, out_hi, out_scale, out_shift); + } else if (f_param_type == swiglu::type_id()) { + ib >> exist_prim; + std::shared_ptr param_desc; + if (exist_prim) { + primitive_id desc_id; + ib >> desc_id; + param_desc = std::dynamic_pointer_cast(get_program().get_node_ptr(desc_id)->desc); + } else { + ib >> param_desc; + } + f_desc.f_param = std::make_shared(param_desc); + } else { f_desc.f_param = std::make_shared(f_param_type); } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl index 201b59c160cf27..01c8e8853e350d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl @@ -95,6 +95,12 @@ KERNEL(quantize_input)( # error "fully_connected_gpu_bf_tiled.cl - TILE_K must be one of {1, 2, 4}" # endif #endif + +#ifdef SWIGLU_LENGTH +# if OUTER_OFM != 2 +# error "fully_connected_gpu_bf_tiled.cl - outer_ofm should be 2 when swiglu is fused" +# endif +#endif #if TILE_K == 4 && COMPRESSED_WEIGHTS_INT4 && FILTER_LAYOUT_OS_IS_YX_OSV32_ISV2 // Data stored in memory : f0k0k1|f16k0k1|f0k2k3|f16k2k3 // => unpack as f0k0k1|f0k2k3|f16k0k1|f16k2k3 so that the weight access order is preserved @@ -210,14 +216,27 @@ inline void FUNC(fc_bf_tiled_kernel_default)( // full dispatch pipeline. 
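For orientation (not part of the patch), the dispatch index math that this hunk modifies can be read as the host-side C++ sketch below; the function name and the `f_per_wi` parameter are illustrative. The effect of the `SWIGLU_LENGTH` branch is that `f_per_wi` stays `TILE_OFM * SIMD` instead of `OUTER_OFM * TILE_OFM * SIMD`, because with swiglu fused the `OUTER_OFM` loop revisits the same output features for the gate and up halves rather than stepping to the next feature tile.

```cpp
#include <cstdint>

// Sketch assumptions: f_per_wi == TILE_OFM * SIMD when SWIGLU_LENGTH is defined,
// OUTER_OFM * TILE_OFM * SIMD otherwise; all names are illustrative.
constexpr uint32_t ceil_div(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

uint32_t out_feature_base(uint32_t gid,
                          uint32_t tile_out_f_num,
                          uint32_t dispatch_fsv,
                          uint32_t dispatch_bsv,
                          uint32_t f_per_wi) {
    const uint32_t feature_mini = gid % dispatch_fsv;
    const uint32_t feature_blocks = ceil_div(tile_out_f_num, f_per_wi) / dispatch_fsv;
    const uint32_t feature_mega = gid / (dispatch_fsv * dispatch_bsv) % feature_blocks;
    // First output feature written by this work item (non-SLM path).
    return (feature_mega * dispatch_fsv + feature_mini) * f_per_wi;
}
```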
uint feature_mini_block = gid % DISPATCH_FSV; uint batch_mini_block = gid / DISPATCH_FSV % DISPATCH_BSV; + #ifdef SWIGLU_LENGTH + uint feature_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV) % (CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV); + uint batch_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV * CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV); + #else uint feature_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV) % (CEIL_DIV(TILE_OUT_F_NUM, OUTER_OFM * TILE_OFM * SIMD) / DISPATCH_FSV); uint batch_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV * CEIL_DIV(TILE_OUT_F_NUM, OUTER_OFM * TILE_OFM * SIMD) / DISPATCH_FSV); + #endif #if USE_SLM + #ifdef SWIGLU_LENGTH + uint out_f = gid * (TILE_OFM * SIMD); + #else uint out_f = gid * (OUTER_OFM * TILE_OFM * SIMD); + #endif uint out_b = LWS_BATCHES * TILE_B * (uint)get_group_id(2) + local_id * TILE_B; #else + #ifdef SWIGLU_LENGTH + uint out_f = (feature_mega_block * DISPATCH_FSV + feature_mini_block) * (TILE_OFM * SIMD); + #else uint out_f = (feature_mega_block * DISPATCH_FSV + feature_mini_block) * (OUTER_OFM * TILE_OFM * SIMD); + #endif uint out_b = ((batch_mega_block * DISPATCH_BSV + batch_mini_block) * TILE_B); #endif @@ -299,9 +318,20 @@ inline void FUNC(fc_bf_tiled_kernel_default)( ACCUMULATOR_TYPE* d_zps = (ACCUMULATOR_TYPE*)(&d_zp); #endif + ACTIVATION_VEC_TYPE activated[TILE_B] = { }; #if OUTER_OFM > 1 uint input_offset_init = input_offset; - unroll_for (uint oi = 0; oi < OUTER_OFM; ++oi) { + uint weights_offset_init = weights_offset; + uint out_f_init = out_f; + __attribute__((opencl_unroll_hint(1))) + for (uint oi = 0; oi < OUTER_OFM; ++oi) { + input_offset = input_offset_init; + #ifdef SWIGLU_LENGTH + weights_offset = weights_offset_init + oi * (FILTER_IFM_NUM / (TILE_K_OFM / TILE_K_OFM_PACKED) ) * SWIGLU_LENGTH; + out_f += SWIGLU_LENGTH * oi; + #else + out_f += TILE_OFM * SIMD * oi; + #endif #endif #if REALIGN_FP16_OFFSET @@ -669,14 +699,38 @@ inline void FUNC(fc_bf_tiled_kernel_default)( #endif // MAIN_LOOP_ELEMENTS_COUNT % (TILE_IFM * SIMD) != 0 // ===================================================================================================================================== // Post-processing: bias, activation, fused-ops - ACTIVATION_VEC_TYPE activated[TILE_B] = { }; - for (uint bi = 0; bi < TILE_B; ++bi) { + unroll_for (uint bi = 0; bi < TILE_B; ++bi) { + #ifdef SWIGLU_LENGTH + #if SWIGLU_SPLIT_TO_GLU_IDX == 0 + if (oi == 0) { + // swish + activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + activated[bi] /= (ACCUMULATOR_VAL_ONE + native_exp(-(ACCUMULATOR_VAL_ONE * activated[bi]))); + } else { + activated[bi] *= TO_ACTIVATION_VEC_TYPE(acc[bi]); + } + #else + if (oi == 0) { + // swish + activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + } else { + acc[bi] /= (ACCUMULATOR_VAL_ONE + native_exp(-(ACCUMULATOR_VAL_ONE * acc[bi]))); + activated[bi] *= TO_ACTIVATION_VEC_TYPE(acc[bi]); + } + #endif + #else activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + #endif #if OUTER_OFM > 1 acc[bi] = 0; #endif } +#if OUTER_OFM > 1 && defined(SWIGLU_LENGTH) + } + out_f = out_f_init; +#endif + #if BIAS_TERM #if TILE_OUT_F_NUM % (OUTER_OFM * TILE_OFM * SIMD) == 0 BIAS_VEC_TYPE bias = BIAS_BLOCK_READ(biases, out_f); @@ -746,9 +800,7 @@ inline void FUNC(fc_bf_tiled_kernel_default)( output_offset += TILE_OUT_B_PITCH - TILE_OFM * SIMD; } } -#if OUTER_OFM > 1 - out_f += TILE_OFM * SIMD; - input_offset = input_offset_init; +#if OUTER_OFM > 1 && !defined(SWIGLU_LENGTH) } #endif // 
===================================================================================================================================== @@ -816,8 +868,14 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( // full dispatch pipeline. uint feature_mini_block = gid % DISPATCH_FSV; uint batch_mini_block = gid / DISPATCH_FSV % DISPATCH_BSV; + #ifdef SWIGLU_LENGTH uint feature_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV) % (CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV); uint batch_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV * CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV); + #else + uint feature_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV) % (CEIL_DIV(TILE_OUT_F_NUM, OUTER_OFM * TILE_OFM * SIMD) / DISPATCH_FSV); + uint batch_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV * CEIL_DIV(TILE_OUT_F_NUM, OUTER_OFM * TILE_OFM * SIMD) / DISPATCH_FSV); + #endif + FILTER_VEC_TYPE wei = 0; @@ -895,6 +953,22 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( ACCUMULATOR_TYPE* d_zps = (ACCUMULATOR_TYPE*)(&d_zp); #endif + ACTIVATION_VEC_TYPE activated[TILE_B] = { }; +#if OUTER_OFM > 1 + uint input_offset_init = input_offset; + uint weights_offset_init = weights_offset; + uint out_f_init = out_f; + __attribute__((opencl_unroll_hint(1))) + for (uint oi = 0; oi < OUTER_OFM; ++oi) { + input_offset = input_offset_init; + #ifdef SWIGLU_LENGTH + weights_offset = weights_offset_init + oi * (FILTER_IFM_NUM / (TILE_K_OFM / TILE_K_OFM_PACKED) ) * SWIGLU_LENGTH; + out_f += SWIGLU_LENGTH * oi; + #else + out_f += TILE_OFM * SIMD * oi; + #endif +#endif + // ===================================================================================================================================== // Main computation loop const uint iterations = MAIN_LOOP_ELEMENTS_COUNT / TILE_IFM_ELEMENTS_SIZE; // TILE_IFM_ELEMENTS_SIZE : (TILE_IFM * SIMD) @@ -1164,11 +1238,37 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( // ===================================================================================================================================== // Post-processing: bias, activation, fused-ops - ACTIVATION_VEC_TYPE activated[TILE_B] = { }; for (uint bi = 0; bi < TILE_B; ++bi) { + #ifdef SWIGLU_LENGTH + #if SWIGLU_SPLIT_TO_GLU_IDX == 0 + if (oi == 0) { + activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + activated[bi] /= (ACCUMULATOR_VAL_ONE + native_exp(-(ACCUMULATOR_VAL_ONE * activated[bi]))); + } else { + activated[bi] *= TO_ACTIVATION_VEC_TYPE(acc[bi]); + } + #else + if (oi == 0) { + // swish + activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + } else { + acc[bi] /= (ACCUMULATOR_VAL_ONE + native_exp(-(ACCUMULATOR_VAL_ONE * acc[bi]))); + activated[bi] *= TO_ACTIVATION_VEC_TYPE(acc[bi]); + } + #endif + #else activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + #endif +#if OUTER_OFM > 1 + acc[bi] = 0; +#endif } +#if OUTER_OFM > 1 && defined(SWIGLU_LENGTH) + } + out_f = out_f_init; +#endif + #if BIAS_TERM #if TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 BIAS_VEC_TYPE bias = BIAS_BLOCK_READ(biases, out_f); @@ -1240,6 +1340,9 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( output_offset += TILE_OUT_B_PITCH - TILE_OFM * SIMD; } } +#if OUTER_OFM > 1 && !defined(SWIGLU_LENGTH) + } +#endif // ===================================================================================================================================== } #endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl 
b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl index ddffa87b202816..ca5c1ea3646d02 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl @@ -25,7 +25,6 @@ inline void (FUNC_NAME)( ) { uint gid = (uint)get_group_id(0); uint sglid = (uint)get_sub_group_local_id(); - // Dispatch as bs_fs_bsv_fsv, where bsv = DISPATCH_BSV and fsv = DISPATCH_FSV. // This allows more fine grained control over dispatch order than using work-groups and // avoids requirement of threads being available for whole work-group. @@ -33,10 +32,19 @@ inline void (FUNC_NAME)( // full dispatch pipeline. uint feature_mini_block = gid % DISPATCH_FSV; uint batch_mini_block = gid / DISPATCH_FSV % DISPATCH_BSV; + #ifdef SWIGLU_LENGTH + uint feature_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV) % (CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV); + uint batch_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV * CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV); + #else uint feature_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV) % (CEIL_DIV(TILE_OUT_F_NUM, OUTER_OFM * TILE_OFM * SIMD) / DISPATCH_FSV); uint batch_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV * CEIL_DIV(TILE_OUT_F_NUM, OUTER_OFM * TILE_OFM * SIMD) / DISPATCH_FSV); + #endif + #ifdef SWIGLU_LENGTH + uint out_f = (feature_mega_block * DISPATCH_FSV + feature_mini_block) * (TILE_OFM * SIMD); + #else uint out_f = (feature_mega_block * DISPATCH_FSV + feature_mini_block) * (OUTER_OFM * TILE_OFM * SIMD); + #endif uint out_b = ((batch_mega_block * DISPATCH_BSV + batch_mini_block) * FORCED_TILE_B); ACCUMULATOR_VEC_TYPE acc[FORCED_TILE_B] = { }; @@ -90,9 +98,19 @@ inline void (FUNC_NAME)( ACCUMULATOR_TYPE* d_zps = (ACCUMULATOR_TYPE*)(&d_zp); #endif + ACTIVATION_VEC_TYPE activated[FORCED_TILE_B] = { }; #if OUTER_OFM > 1 uint input_offset_init = input_offset; + uint weights_offset_init = weights_offset; + uint out_f_init = out_f; unroll_for (uint oi = 0; oi < OUTER_OFM; ++oi) { + input_offset = input_offset_init; + #ifdef SWIGLU_LENGTH + weights_offset = weights_offset_init + oi * (FILTER_IFM_NUM / (TILE_K_OFM / TILE_K_OFM_PACKED) ) * SWIGLU_LENGTH; + out_f += SWIGLU_LENGTH * oi; + #else + out_f += TILE_OFM * SIMD * oi; + #endif #endif #if REALIGN_FP16_OFFSET @@ -297,14 +315,37 @@ inline void (FUNC_NAME)( #endif // MAIN_LOOP_ELEMENTS_COUNT % (TILE_IFM * SIMD) != 0 // ===================================================================================================================================== // Post-processing: bias, activation, fused-ops - ACTIVATION_VEC_TYPE activated[FORCED_TILE_B] = { }; for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { + #ifdef SWIGLU_LENGTH + #if SWIGLU_SPLIT_TO_GLU_IDX == 0 + if (oi == 0) { + activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + activated[bi] /= (ACCUMULATOR_VAL_ONE + native_exp(-(ACCUMULATOR_VAL_ONE * activated[bi]))); + } else { + activated[bi] *= TO_ACTIVATION_VEC_TYPE(acc[bi]); + } + #else + if (oi == 0) { + // swish + activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + } else { + acc[bi] /= (ACCUMULATOR_VAL_ONE + native_exp(-(ACCUMULATOR_VAL_ONE * acc[bi]))); + activated[bi] *= TO_ACTIVATION_VEC_TYPE(acc[bi]); + } + #endif + #else activated[bi] = TO_ACTIVATION_VEC_TYPE(acc[bi]); + #endif #if OUTER_OFM > 1 acc[bi] = 0; #endif } +#if OUTER_OFM > 1 && defined(SWIGLU_LENGTH) + } + out_f = out_f_init; 
+#endif + #if BIAS_TERM #if TILE_OUT_F_NUM % (OUTER_OFM * TILE_OFM * SIMD) == 0 BIAS_VEC_TYPE bias = BIAS_BLOCK_READ(biases, out_f); @@ -396,9 +437,7 @@ inline void (FUNC_NAME)( output_offset += TILE_OUT_B_PITCH - TILE_OFM * SIMD; } } -#if OUTER_OFM > 1 - out_f += TILE_OFM * SIMD; - input_offset = input_offset_init; +#if OUTER_OFM > 1 && !defined(SWIGLU_LENGTH) } #endif // ===================================================================================================================================== diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index 02304512637783..46e8f7f1104f0d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -4,6 +4,7 @@ #include "fully_connected_kernel_bf_tiled.h" #include "kernel_selector_utils.h" +#include "swiglu/swiglu_kernel_base.h" #include #include #include "common_types.h" @@ -163,7 +164,21 @@ static bool is_weight_small_kn(const fully_connected_params& params, size_t outp return output_f / 2 /*most frequently used tile_ofm*/ <= min_num_threads; } +static bool is_swiglu_fused(const fully_connected_params& params) { + bool swiglu_fused = false; + if (!params.fused_ops.empty()) { + for (auto p : params.fused_ops) { + if (p.GetType() == kernel_selector::KernelType::SWIGLU) + swiglu_fused = true; + } + } + if (swiglu_fused) + OPENVINO_ASSERT(params.fused_ops.size() == 1); + return swiglu_fused; +} static bool is_suitable_outer_ofm(const fully_connected_params& params, size_t output_f) { + if (is_swiglu_fused(params)) + return true; size_t min_num_threads = params.engineInfo.computeUnitsCount * simd; return (params.weights.OFM().v > params.weights.IFM().v * 6 && output_f / 8 /* tile_ofm=4 and outer_ofm=2 */ > min_num_threads * 1.5); @@ -406,6 +421,8 @@ FullyConnected_bf_tiled::GetAutoTuneParams(const fully_connected_params& params, while (max_tile_ofm * 2 * simd <= output_f && max_tile_ofm < 4) max_tile_ofm *= 2; + bool swiglu_fused = is_swiglu_fused(params); + if (params.weights.GetDType() == WeightsType::UINT4 || params.weights.GetDType() == WeightsType::INT4 || (is_weight_dyn_quantizable(params) && should_dynamic_quantize(params))) { // Only 4bit weight type is fully optimized to use SLM. In default kernel, SLM is not applied to 8bit weight. 
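For orientation (not part of the patch), the `SWIGLU_LENGTH` post-processing branches above implement plain SwiGLU: the two `OUTER_OFM` passes produce the two halves of the split FC output, Swish (`x * sigmoid(x)`) is applied to the half selected by `SWIGLU_SPLIT_TO_GLU_IDX`, and the result is multiplied elementwise with the other half. A minimal C++ reference sketch follows; `swish`, `swiglu_ref` and the half0/half1 naming are illustrative.

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Swish activation: x * sigmoid(x), matching the acc / (1 + exp(-acc)) form in the kernel.
inline float swish(float x) { return x / (1.0f + std::exp(-x)); }

// half0/half1 correspond to the oi == 0 and oi == 1 passes of the fused kernel.
std::vector<float> swiglu_ref(const std::vector<float>& half0,
                              const std::vector<float>& half1,
                              int split_to_glu_idx) {
    std::vector<float> out(half0.size());
    for (std::size_t i = 0; i < half0.size(); ++i)
        out[i] = split_to_glu_idx == 0 ? swish(half0[i]) * half1[i]
                                       : half0[i] * swish(half1[i]);
    return out;
}
```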
@@ -426,30 +443,39 @@ FullyConnected_bf_tiled::GetAutoTuneParams(const fully_connected_params& params, if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) { return selector.Default(tune_params(1, 1, 4, 4, 1, 1, 1, EXE_MODE_DEFAULT)); } else if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) { - selector.Case(tune_params(1, 4, 4, 2, 2, 1, 1, EXE_MODE_DEFAULT)) - .Case(tune_params(1, 4, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT)); + // Here : b1 static + if (swiglu_fused) { + return selector.Default(tune_params(1, 4, 4, 2, 2, 1, 1, EXE_MODE_DEFAULT)); + } else { + selector.Case(tune_params(1, 4, 4, 2, 2, 1, 1, EXE_MODE_DEFAULT)) + .Case(tune_params(1, 4, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT)); + } } else { - return selector.Default(tune_params(1, 2, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT)); + if (swiglu_fused) { + return selector.Default(tune_params(1, 2, 4, 2, 2, 1, 1, EXE_MODE_DEFAULT)); + } else { + return selector.Default(tune_params(1, 2, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT)); + } } } } else { // Try to use SLM kernels if possible + unsigned int forced_outer_ofm = swiglu_fused ? 2 : 1; if (preferred_kernel_type != KernelType::DEFAULT) { if (params.is_shape_agnostic && !should_dynamic_quantize(params)) { - selector.Case(tune_params(16, 2, 2, 4, 1, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)) - .Case(tune_params(16, 2, 1, 4, 1, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)); + selector.Case(tune_params(16, 2, 2, 4, forced_outer_ofm, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)) + .Case(tune_params(16, 2, 1, 4, forced_outer_ofm, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)); } - - selector.Case(tune_params(8, 2, 2, 4, 1, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)) - .Case(tune_params(8, 2, 1, 4, 1, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)); + selector.Case(tune_params(8, 2, 2, 4, forced_outer_ofm, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)) + .Case(tune_params(8, 2, 1, 4, forced_outer_ofm, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)); } if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) - return selector.Default(tune_params(8, 1, 1, 4, 1, 1, 1, EXE_MODE_DEFAULT)); + return selector.Default(tune_params(8, 1, 1, 4, forced_outer_ofm, 1, 1, EXE_MODE_DEFAULT)); else if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) - return selector.Default(tune_params(8, 4, 1, 2, 1, 1, 1, EXE_MODE_DEFAULT)); + return selector.Default(tune_params(8, 4, 1, 2, forced_outer_ofm, 1, 1, EXE_MODE_DEFAULT)); else - return selector.Default(tune_params(8, 2, 1, 4, 1, 1, 1, EXE_MODE_DEFAULT)); + return selector.Default(tune_params(8, 2, 1, 4, forced_outer_ofm, 1, 1, EXE_MODE_DEFAULT)); } } else if (params.compressed && params.engineInfo.supports_immad) { return selector.Default(tune_params(1, 1, 1, 4, 1, 1, 1, EXE_MODE_DEFAULT)); @@ -526,8 +552,12 @@ FullyConnected_bf_tiled::SetDefault(const fully_connected_params& params, int au kernel_type = kernel_number == 0 ? 
KernelType::DEFAULT : KernelType::SLM; auto tparams = GetAutoTuneParams(params, kernel_type, autoTuneIndex); + std::pair threads; + if (is_swiglu_fused(params)) + threads = get_output_aligned_bf_size(params, true, tparams.tile_b, tparams.tile_ofm * simd); + else + threads = get_output_aligned_bf_size(params, true, tparams.tile_b, tparams.tile_ofm * tparams.outer_ofm * simd); - auto threads = get_output_aligned_bf_size(params, true, tparams.tile_b, tparams.tile_ofm * tparams.outer_ofm * simd); auto batch_threads = threads.first; auto feature_threads = threads.second; @@ -575,6 +605,13 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para size_t tile_k_ofm_packed = tile_k_ofm; size_t quantize_grp_size = get_dynamic_quantize_group_size(params); + if (is_swiglu_fused(params)) { + auto split_length = params.fused_ops[0].GetOpParams()->split_length; + auto split_to_glu_idx = params.fused_ops[0].GetOpParams()->split_to_glu_idx; + jit.AddConstant(MakeJitConstant("SWIGLU_LENGTH", split_length)); + jit.AddConstant(MakeJitConstant("SWIGLU_SPLIT_TO_GLU_IDX", split_to_glu_idx)); + } + bool add_decompress_scale_post_op = false; WeightsType weights_dt = params.weights.GetDType(); if (weights_dt == WeightsType::UINT4 || weights_dt == WeightsType::INT4) { @@ -723,7 +760,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para jit.AddConstant(MakeJitConstant("BATCH_SIZE", "(OUTPUT_BATCH_NUM)")); } - if (!params.fused_ops.empty()) { + if (!params.fused_ops.empty() && !is_swiglu_fused(params)) { std::vector idx_order_scalar = { "(out_b + bi)", "(out_f + sglid)", "0", "0" }; std::vector idx_order_vec = { "(out_b + bi)", "(out_f + sglid + fi * SIMD)", "0", "0" }; if (params.outputs[0].GetLayout() == DataLayout::bfyx) { @@ -828,7 +865,7 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa auto output_f = get_output_aligned_bf_size(fc_params, false).second; WeightsLayout weights_layout = WeightsLayout::os_iyx_osv16; - if (fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16 + if (!is_swiglu_fused(fc_params) && fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16 && (fc_params.weights.GetLayout() == WeightsLayout::oiyx || fc_params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) && (fc_params.weights.GetDType() == WeightsType::INT4 || fc_params.weights.GetDType() == WeightsType::UINT4) && is_weight_horizontal(fc_params, output_f)) { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.h index cbbf52adf344ce..1093c7377bf76f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.h @@ -76,7 +76,8 @@ class FullyConnected_bf_tiled : public FullyConnectedKernelBase { std::vector GetSupportedFusedOps() const override { return { FusedOpType::ACTIVATION, FusedOpType::ELTWISE, - FusedOpType::QUANTIZE }; + FusedOpType::QUANTIZE, + FusedOpType::SWIGLU }; } JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const override; bool Validate(const Params& params) const override; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/swiglu/swiglu_kernel_base.h 
b/src/plugins/intel_gpu/src/kernel_selector/kernels/swiglu/swiglu_kernel_base.h index 2f5c046690f78d..bb5625ba087a2d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/swiglu/swiglu_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/swiglu/swiglu_kernel_base.h @@ -21,6 +21,17 @@ struct swiglu_params : public base_params { int32_t split_to_glu_idx; }; +struct swiglu_fuse_params : fuse_params { + explicit swiglu_fuse_params(int32_t axis, size_t split_lengths, size_t split_to_glu_idx) + : fuse_params(KernelType::SWIGLU), + axis(axis), + split_length(split_lengths), + split_to_glu_idx(split_to_glu_idx) {} + int32_t axis; + size_t split_length; + size_t split_to_glu_idx; +}; + class SwiGLUKernelBase : public KernelBaseOpenCL { public: using KernelBaseOpenCL::KernelBaseOpenCL; diff --git a/src/plugins/intel_gpu/src/plugin/ops/swiglu.cpp b/src/plugins/intel_gpu/src/plugin/ops/swiglu.cpp index 5df2cafd41a41f..23b44dcc1a4677 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/swiglu.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/swiglu.cpp @@ -21,7 +21,7 @@ static void CreateGLUOp(ProgramBuilder& p, const std::shared_ptr& op) { if (p.use_new_shape_infer()) { auto prim = cldnn::swiglu(primitive_name, inputs[0], - op->get_axis(), + (op->get_axis() < 0 ? op->get_input_partial_shape(0).size() + op->get_axis() : op->get_axis()), op->get_split_lengths(), op->get_glu_type(), op->get_split_to_glu_idx(), @@ -31,7 +31,7 @@ static void CreateGLUOp(ProgramBuilder& p, const std::shared_ptr& op) { } else { auto prim = cldnn::swiglu(primitive_name, inputs[0], - op->get_axis(), + (op->get_axis() < 0 ? op->get_input_partial_shape(0).size() + op->get_axis() : op->get_axis()), op->get_split_lengths(), op->get_glu_type(), op->get_split_to_glu_idx(), diff --git a/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp index fcb339531c1883..327de1424c34c9 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp @@ -18,16 +18,25 @@ namespace ov { namespace intel_gpu { -FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() { +FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion(bool fuse_mlp_swiglu) { using namespace ov::pass::pattern; - auto is_target_pattern = [](const Output& output) { + GPU_DEBUG_GET_INSTANCE(debug_config); + // Three FCs connected to the same input + size_t min_num_fcs_to_fuse = 3; + // Note: + // For cldnn, two fcs in mlp will be fused at horizontal fc fusion, and then swiglu will be fused at prepare_primitive_fusion + // i.e., eltwise((fc + swish), fc) => fused_fc + swiglu => fused_fc_swilgu + // Onednn gemms are to be handled in a different way (TBD) + if (fuse_mlp_swiglu) + min_num_fcs_to_fuse = 2; + auto is_target_pattern = [min_num_fcs_to_fuse](const Output& output) { + const int max_num_fcs_to_fuse = 3; // Currently this pass targets only compressed FCs (QKV) on dynamic generative models // inputs: input, weight, bias, scale, [zp] // Bias/scale/zp are constant or none // if it is not constant, the only allowed cases are Constant => convert // All FCs have same # of valid inputs (e.g., if one of the fc has zp, all fcs have zp) - auto is_constant = [](const std::shared_ptr node) { if (std::dynamic_pointer_cast(node)) return true; @@ -40,9 +49,7 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion() { auto is_placeholder = [](const 
std::shared_ptr node) { return std::dynamic_pointer_cast(node); }; - // Three FCs connected to the same input - const int min_num_fcs_to_fuse = 3; - const int max_num_fcs_to_fuse = 3; + const auto& fc = std::dynamic_pointer_cast(output.get_node_shared_ptr()); const auto& input = fc->get_input_node_shared_ptr(0); if (!fc->get_input_partial_shape(0).is_dynamic()) diff --git a/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.hpp b/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.hpp index b6a852354bad8d..67abaa3df54357 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.hpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.hpp @@ -12,7 +12,7 @@ namespace intel_gpu { class FullyConnectedHorizontalFusion: public ov::pass::MatcherPass { public: OPENVINO_RTTI("FullyConnectedHorizontalFusion", "0"); - FullyConnectedHorizontalFusion(); + FullyConnectedHorizontalFusion(bool fuse_mlp_swiglu = false); }; } // namespace intel_gpu diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index fcb88560944854..e47ccbb09a9c43 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -913,12 +913,18 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); bool disable_horizontal_fc_fusion = false; + bool disable_fc_swiglu_fusion = false; GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_IF(debug_config->disable_horizontal_fc_fusion == 1) disable_horizontal_fc_fusion = true; - + GPU_DEBUG_IF(debug_config->disable_fc_swiglu_fusion == 1) + disable_fc_swiglu_fusion = true; + // mlp fusion is only supported for cldnn on high performant GPUis + bool fuse_mlp_swiglu = !device_info.supports_immad && + device_info.execution_units_count >= 128 && + !disable_fc_swiglu_fusion; if (!disable_horizontal_fc_fusion) - manager.register_pass(); + manager.register_pass(fuse_mlp_swiglu); // ZP should not be folded for FC. But still, ZP should be folded for Gather. // Therefore, run MarkDequantizationSubgraph again to fold ZP constant. diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp index 4a68355e1bc8ba..65ca31f16c720c 100644 --- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp +++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp @@ -191,6 +191,7 @@ static void print_help_messages() { message_list.emplace_back("OV_GPU_DynamicQuantizeGroupSize", "Specify a group size of dynamic quantization to enable " "dynamic quantization for Fully-connected primitive."); message_list.emplace_back("OV_GPU_DisableHorizontalFCFusion", "Disable horizontal fc fusion"); + message_list.emplace_back("OV_GPU_DisableFCSwigluFusion", "Disable fc + swiglu fusion"); message_list.emplace_back("OV_GPU_DumpIteration", "Dump n-th execution of network, separated by space."); message_list.emplace_back("OV_GPU_MemPreallocationOptions", "Controls buffer pre-allocation feature. 
Expects 4 values separated by space in " "the following order: number of iterations for pre-allocation(int), max size of single iteration in bytes(int), " @@ -259,7 +260,8 @@ debug_configuration::debug_configuration() , use_usm_host(0) , use_kv_cache_compression(-1) , dynamic_quantize_group_size(DYNAMIC_QUANTIZE_GROUP_SIZE_NOT_SET) - , disable_horizontal_fc_fusion(0) { + , disable_horizontal_fc_fusion(0) + , disable_fc_swiglu_fusion(0) { #ifdef GPU_DEBUG_CONFIG get_gpu_debug_env_var("Help", help); get_common_debug_env_var("Verbose", verbose); @@ -314,6 +316,7 @@ debug_configuration::debug_configuration() get_gpu_debug_env_var("KVCacheCompression", use_kv_cache_compression); get_gpu_debug_env_var("DynamicQuantizeGroupSize", dynamic_quantize_group_size); get_gpu_debug_env_var("DisableHorizontalFCFusion", disable_horizontal_fc_fusion); + get_gpu_debug_env_var("DisableFCSwigluFusion", disable_fc_swiglu_fusion); std::string dump_iteration_str; get_gpu_debug_env_var("DumpIteration", dump_iteration_str); std::string mem_preallocation_params_str; diff --git a/src/plugins/intel_gpu/tests/unit/fusions/fully_connected_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/fully_connected_fusion_test.cpp index 5e9b5134fb3802..09e164742f3fd9 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/fully_connected_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/fusions/fully_connected_fusion_test.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -73,7 +74,7 @@ class FullyConnectedFusingTest : public ::BaseFusingTest{ + fully_connected_test_params{ CASE_FC_FP16_INT4_SWIGLU_1, 2, 3 }, + fully_connected_test_params{ CASE_FC_FP16_INT4_SWIGLU_2, 2, 3 }, + fully_connected_test_params{ CASE_FC_FP16_INT4_SWIGLU_3, 2, 3 }, +})); + class fc_imad_int8_eltwise_add_ocl_dynamic : public FullyConnectedFusingTest { public: void run_test() { diff --git a/src/plugins/intel_gpu/tests/unit/fusions/fusion_test_common.hpp b/src/plugins/intel_gpu/tests/unit/fusions/fusion_test_common.hpp index eb0f63c651e50d..c469925083b775 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/fusion_test_common.hpp +++ b/src/plugins/intel_gpu/tests/unit/fusions/fusion_test_common.hpp @@ -147,6 +147,12 @@ class BaseFusingTest : public ::testing::TestWithParam { } else if (l.data_type == data_types::i8) { VF rnd_vec(s.count(), static_cast(fill_value)); set_values(prim, rnd_vec); + } else if (l.data_type == data_types::u4) { + VF rnd_vec(s.count()/2, static_cast(fill_value)); + set_values(prim, rnd_vec); + } else if (l.data_type == data_types::i4) { + VF rnd_vec(s.count()/2, static_cast(fill_value)); + set_values(prim, rnd_vec); } else { throw std::runtime_error("get_mem: Unsupported precision"); } @@ -186,6 +192,12 @@ class BaseFusingTest : public ::testing::TestWithParam { } else if (l.data_type == data_types::u8) { VF rnd_vec = rg.generate_random_1d(s.count(), min, max); set_values(prim, rnd_vec); + } else if (l.data_type == data_types::i4) { + VF rnd_vec = rg.generate_random_1d(s.count()/2, min, max); + set_values(prim, rnd_vec); + } else if (l.data_type == data_types::u4) { + VF rnd_vec = rg.generate_random_1d(s.count()/2, min, max); + set_values(prim, rnd_vec); } return prim; From 94f647dc5f0e3a349aadabf4ae377aa6a2b063b4 Mon Sep 17 00:00:00 2001 From: Pawel Raasz Date: Fri, 6 Dec 2024 16:50:59 +0100 Subject: [PATCH 09/23] [core] Extend Core API to accept std::filesystem::path when build with cpp17 (#27950) ### Details: - The `ov::Core` accepts `std::filesytem::path` in functions where string as path is 
used. ### Tickets: - CVS-157908 --------- Signed-off-by: Pawel Raasz --- .../shape_inference/include/ov_optional.hpp | 4 ++ src/core/tests/pattern.cpp | 6 +- .../include/openvino/runtime/core.hpp | 50 ++++++++++++++ .../tests/functional/ov_core_test.cpp | 69 +++++++++++++++++-- .../tests/functional/ov_extension_test.cpp | 6 ++ 5 files changed, 126 insertions(+), 9 deletions(-) diff --git a/src/core/shape_inference/include/ov_optional.hpp b/src/core/shape_inference/include/ov_optional.hpp index f7f8b474f9a5a6..15973ae0c8a5f8 100644 --- a/src/core/shape_inference/include/ov_optional.hpp +++ b/src/core/shape_inference/include/ov_optional.hpp @@ -7,6 +7,9 @@ #include namespace ov { +#ifdef OPENVINO_CPP_17_VER +using optional = std::optional; +#else /** * @brief Store optional object of type T (basic version of std::optional). @@ -132,4 +135,5 @@ class optional { bool m_has_value = false; Storage m_opt{}; }; +#endif } // namespace ov diff --git a/src/core/tests/pattern.cpp b/src/core/tests/pattern.cpp index 050c36b65baad1..982e59b55f0f97 100644 --- a/src/core/tests/pattern.cpp +++ b/src/core/tests/pattern.cpp @@ -558,8 +558,8 @@ TEST(pattern, multiple_optionals_in_row) { // Pattern: auto in = wrap_type(); - auto pattern_convert = optional(in); - auto pattern_relu = optional(pattern_convert); + auto pattern_convert = pattern::optional(in); + auto pattern_relu = pattern::optional(pattern_convert); auto pattern_sigmoid = wrap_type({pattern_relu}); // Test: @@ -1255,4 +1255,4 @@ TEST(pattern, pattern_optional_root) { // Should perfectly match ASSERT_TRUE(tm.match(pattern_relu, model_relu)); -} \ No newline at end of file +} diff --git a/src/inference/include/openvino/runtime/core.hpp b/src/inference/include/openvino/runtime/core.hpp index f0ba27c1cf5daa..c13432d664e736 100644 --- a/src/inference/include/openvino/runtime/core.hpp +++ b/src/inference/include/openvino/runtime/core.hpp @@ -25,6 +25,10 @@ #include "openvino/runtime/remote_context.hpp" #include "openvino/runtime/tensor.hpp" +#ifdef OPENVINO_CPP_VER_17 +# include +#endif + namespace ov { /** @@ -95,9 +99,18 @@ class OPENVINO_RUNTIME_API Core { * * TF (*.pb) * * TFLite (*.tflite) * @return A model. + * @{ */ std::shared_ptr read_model(const std::string& model_path, const std::string& bin_path = {}) const; +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + std::shared_ptr read_model(const Path& model_path, const Path& bin_path = {}) const { + return read_model(model_path.string(), bin_path.string()); + } +#endif + /// @} + /** * @brief Reads models from IR / ONNX / PDPD / TF / TFLite formats. * @param model String with a model in IR / ONNX / PDPD / TF / TFLite format. @@ -197,6 +210,13 @@ class OPENVINO_RUNTIME_API Core { */ CompiledModel compile_model(const std::string& model_path, const AnyMap& properties = {}); +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + auto compile_model(const Path& model_path, const AnyMap& properties = {}) const { + return compile_model(model_path.string(), properties); + } +#endif + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT CompiledModel compile_model(const std::wstring& model_path, const AnyMap& properties = {}); #endif @@ -223,6 +243,13 @@ class OPENVINO_RUNTIME_API Core { return compile_model(model_path, AnyMap{std::forward(properties)...}); } +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + auto compile_model(const Path& model_path, Properties&&... 
properties) { + return compile_model(model_path.string(), std::forward(properties)...); + } +#endif + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT template util::EnableIfAllStringAny compile_model(const std::wstring& model_path, @@ -250,6 +277,13 @@ class OPENVINO_RUNTIME_API Core { const std::string& device_name, const AnyMap& properties = {}); +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + auto compile_model(const Path& model_path, const std::string& device_name, const AnyMap& properties = {}) { + return compile_model(model_path.string(), device_name, properties); + } +#endif + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT CompiledModel compile_model(const std::wstring& model_path, const std::string& device_name, @@ -279,6 +313,13 @@ class OPENVINO_RUNTIME_API Core { return compile_model(model_path, device_name, AnyMap{std::forward(properties)...}); } +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + auto compile_model(const Path& model_path, const std::string& device_name, Properties&&... properties) { + return compile_model(model_path.string(), device_name, std::forward(properties)...); + } +#endif + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT template util::EnableIfAllStringAny compile_model(const std::wstring& model_path, @@ -359,9 +400,18 @@ class OPENVINO_RUNTIME_API Core { /** * @brief Registers an extension to a Core object. * @param library_path Path to the library with ov::Extension. + * @{ */ void add_extension(const std::string& library_path); +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + void add_extension(const Path& model_path) { + add_extension(model_path.string()); + } +#endif + /// @} + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT /** * @brief Registers an extension to a Core object. diff --git a/src/inference/tests/functional/ov_core_test.cpp b/src/inference/tests/functional/ov_core_test.cpp index 26eb38e3fd13e5..60f91b85b3338a 100644 --- a/src/inference/tests/functional/ov_core_test.cpp +++ b/src/inference/tests/functional/ov_core_test.cpp @@ -8,9 +8,26 @@ #include "common_test_utils/common_utils.hpp" #include "common_test_utils/file_utils.hpp" +#include "functional_test_utils/test_model/test_model.hpp" #include "openvino/runtime/core.hpp" #include "openvino/util/file_util.hpp" +class CoreBaseTest : public testing::Test { +protected: + void generate_test_model_files(const std::string& name) { + auto prefix = ov::test::utils::generateTestFilePrefix(); + model_file_name = prefix + name + ".xml"; + weight_file_name = prefix + name + ".bin"; + ov::test::utils::generate_test_model(model_file_name, weight_file_name); + } + + void TearDown() override { + ov::test::utils::removeIRFiles(model_file_name, weight_file_name); + } + + std::string model_file_name, weight_file_name; +}; + #ifndef OPENVINO_STATIC_LIBRARY static void create_plugin_xml(const std::string& file_name, const std::string& plugin_name = "1") { @@ -33,7 +50,7 @@ static void remove_plugin_xml(const std::string& file_name) { ov::test::utils::removeFile(file_name); } -TEST(CoreBaseTest, LoadPluginXML) { +TEST_F(CoreBaseTest, LoadPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string xml_file_path = ov::test::utils::getOpenvinoLibDirectory() + ov::util::FileTraits::file_separator + xml_file_name; @@ -42,7 +59,7 @@ TEST(CoreBaseTest, LoadPluginXML) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadPluginDifferentXMLExtension) { +TEST_F(CoreBaseTest, LoadPluginDifferentXMLExtension) { std::string xml_file_name = "test_plugin.test"; std::string xml_file_path = 
ov::test::utils::getOpenvinoLibDirectory() + ov::util::FileTraits::file_separator + xml_file_name; @@ -51,7 +68,7 @@ TEST(CoreBaseTest, LoadPluginDifferentXMLExtension) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadAbsoluteOVPathPluginXML) { +TEST_F(CoreBaseTest, LoadAbsoluteOVPathPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string xml_file_path = ov::test::utils::getOpenvinoLibDirectory() + ov::util::FileTraits::file_separator + xml_file_name; @@ -60,7 +77,7 @@ TEST(CoreBaseTest, LoadAbsoluteOVPathPluginXML) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadAbsoluteCWPathPluginXML) { +TEST_F(CoreBaseTest, LoadAbsoluteCWPathPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string xml_file_path = ov::test::utils::getCurrentWorkingDir() + ov::util::FileTraits::file_separator + xml_file_name; @@ -69,7 +86,7 @@ TEST(CoreBaseTest, LoadAbsoluteCWPathPluginXML) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadRelativeCWPathPluginXML) { +TEST_F(CoreBaseTest, LoadRelativeCWPathPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string xml_file_path = ov::test::utils::getCurrentWorkingDir() + ov::util::FileTraits::file_separator + xml_file_name; @@ -78,7 +95,7 @@ TEST(CoreBaseTest, LoadRelativeCWPathPluginXML) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadOVFolderOverCWPathPluginXML) { +TEST_F(CoreBaseTest, LoadOVFolderOverCWPathPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string cwd_file_path = ov::test::utils::getCurrentWorkingDir() + ov::util::FileTraits::file_separator + xml_file_name; @@ -96,3 +113,43 @@ TEST(CoreBaseTest, LoadOVFolderOverCWPathPluginXML) { } #endif + +#if defined(OPENVINO_CPP_VER_17) && defined(ENABLE_OV_IR_FRONTEND) +namespace ov::test { +TEST_F(CoreBaseTest, read_model_with_std_fs_path) { + generate_test_model_files("test-model"); + + const auto model_path = std::filesystem::path(model_file_name); + const auto weight_path = std::filesystem::path(weight_file_name); + + ov::Core core; + { + const auto model = core.read_model(model_path); + EXPECT_NE(model, nullptr); + } + { + const auto model = core.read_model(model_path, weight_path); + EXPECT_NE(model, nullptr); + } +} + +TEST_F(CoreBaseTest, compile_model_with_std_fs_path) { + generate_test_model_files("model2"); + + const auto model_path = std::filesystem::path(model_file_name); + const auto weight_path = std::filesystem::path(weight_file_name); + + ov::Core core; + { + const auto model = core.compile_model(model_path); + EXPECT_TRUE(model); + } + { + const auto devices = core.get_available_devices(); + + const auto model = core.compile_model(model_path, devices.at(0), ov::AnyMap{}); + EXPECT_TRUE(model); + } +} +} // namespace ov::test +#endif diff --git a/src/inference/tests/functional/ov_extension_test.cpp b/src/inference/tests/functional/ov_extension_test.cpp index 6f93a8acdaf2fa..b840c430d092e9 100644 --- a/src/inference/tests/functional/ov_extension_test.cpp +++ b/src/inference/tests/functional/ov_extension_test.cpp @@ -82,6 +82,12 @@ class CustomReLU : public ov::op::Op { }; #if defined(ENABLE_OV_IR_FRONTEND) +# ifdef OPENVINO_CPP_VER_17 +TEST_F(OVExtensionTests, ReshapeIRWithNewExtensionsPathLib) { + core.add_extension(std::filesystem::path(getOVExtensionPath())); + test(); +} +# endif TEST_F(OVExtensionTests, ReshapeIRWithNewExtensionsLib) { core.add_extension(getOVExtensionPath()); From e8fa9f7b84d1d19e4581f56ef4dd8e88934b878e Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Fri, 6 Dec 
2024 20:29:49 +0400 Subject: [PATCH 10/23] [TF FE] Run HSVToRGB tests on all platforms (#27945) **Details:** Run HSVToRGB tests on all platforms **Ticket:** TBD --------- Signed-off-by: Kazantsev, Roman --- .../tensorflow_tests/test_tf_HSVToRGB.py | 53 +++++++------------ 1 file changed, 19 insertions(+), 34 deletions(-) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_HSVToRGB.py b/tests/layer_tests/tensorflow_tests/test_tf_HSVToRGB.py index 9f3ab9845fb24f..17df8c52430ec5 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_HSVToRGB.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_HSVToRGB.py @@ -1,32 +1,28 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import platform - import numpy as np import pytest import tensorflow as tf from common.tf_layer_test_class import CommonTFLayerTest +rng = np.random.default_rng(23345) + + class TestHSVToRGB(CommonTFLayerTest): def _prepare_input(self, inputs_info): assert 'images:0' in inputs_info - if self.special_case == "Black Image": - images_shape = inputs_info['images:0'] - inputs_data = {} - inputs_data['images:0'] = np.zeros(images_shape).astype(self.input_type) - elif self.special_case == "Grayscale Image": - images_shape = inputs_info['images:0'] - inputs_data = {} + images_shape = inputs_info['images:0'] + inputs_data = {} + if self.special_case == 'Black Image': + inputs_data['images:0'] = np.zeros(images_shape).astype(self.input_type) + elif self.special_case == 'Grayscale Image': inputs_data['images:0'] = np.broadcast_to([0, 0, 0.5], images_shape).astype(self.input_type) else: - images_shape = inputs_info['images:0'] - inputs_data = {} - inputs_data['images:0'] = np.random.rand(*images_shape).astype(self.input_type) - + inputs_data['images:0'] = rng.uniform(0.0, 1.0, images_shape).astype(self.input_type) return inputs_data - def create_hsv_to_rgb_net(self, input_shape, input_type, special_case=False): + def create_hsv_to_rgb_net(self, input_shape, input_type, special_case): self.special_case = special_case self.input_type = input_type tf.compat.v1.reset_default_graph() @@ -39,27 +35,16 @@ def create_hsv_to_rgb_net(self, input_shape, input_type, special_case=False): return tf_net, None - # Each input is a tensor of with values in [0,1]. - # The last dimension must be size 3. 
- test_data_basic = [ - dict(input_shape=[7, 7, 3], input_type=np.float32, special_case="Black Image"), - dict(input_shape=[7, 7, 3], input_type=np.float32, special_case="Grayscale Image"), - dict(input_shape=[5, 5, 3], input_type=np.float32), - dict(input_shape=[5, 23, 27, 3], input_type=np.float64), - dict(input_shape=[3, 4, 13, 15, 3], input_type=np.float64), - ] - - @pytest.mark.parametrize("params", test_data_basic) + @pytest.mark.parametrize('input_shape', [[3], [5, 3], [4, 5, 3], [5, 21, 21, 3]]) + @pytest.mark.parametrize('input_type', [np.float16, np.float32, np.float64]) + @pytest.mark.parametrize('special_case', [None, 'Black Image', 'Grayscale Image']) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', - 'aarch64', - 'arm64', 'ARM64'], - reason='Ticket - 126314, 132699') - def test_hsv_to_rgb_basic(self, params, ie_device, precision, ir_version, temp_dir, - use_legacy_frontend): + def test_hsv_to_rgb_basic(self, input_shape, input_type, special_case, + ie_device, precision, ir_version, temp_dir, + use_legacy_frontend): if ie_device == 'GPU': - pytest.skip("Accuracy mismatch on GPU") - self._test(*self.create_hsv_to_rgb_net(**params), + pytest.skip('158898: accuracy issue on GPU') + self._test(*self.create_hsv_to_rgb_net(input_shape, input_type, special_case), ie_device, precision, ir_version, temp_dir=temp_dir, - use_legacy_frontend=use_legacy_frontend) + use_legacy_frontend=use_legacy_frontend, custom_eps=3 * 1e-3) From b840082ac11b1608f349d9554b020498c328164f Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Mon, 9 Dec 2024 14:09:30 +0900 Subject: [PATCH 11/23] [GPU] Integrate dynamic quantization for onednn (#26940) ### Details: - Integrated grouped dynamic quantization from onednn - Integrated asymmetric per-token dynamic quantization from onednn - Those are not enabled by default, yet ### Tickets: - 148732, 157869, 157589 --- .../op/fully_connected_compressed.hpp | 1 + .../intel_gpu/primitives/dynamic_quantize.hpp | 13 +- .../intel_gpu/primitives/fully_connected.hpp | 18 +++ .../intel_gpu/runtime/debug_configuration.hpp | 1 + .../prepare_primitive_fusing.cpp | 2 + .../src/graph/impls/ocl/dynamic_quantize.cpp | 8 +- .../impls/onednn/fully_connected_onednn.cpp | 47 +++++-- .../impls/onednn/fully_connected_onednn.hpp | 2 +- .../cl_kernels/dynamic_quantize_gpu_opt.cl | 133 ++++++++++++++++-- .../cl_kernels/dynamic_quantize_gpu_ref.cl | 50 ++++--- .../dynamic_quantize_kernel_opt.cpp | 56 +++++--- .../dynamic_quantize_kernel_ref.cpp | 18 ++- .../fully_connected_kernel_bf_tiled.cpp | 20 +-- .../src/plugin/ops/dynamic_quantize.cpp | 3 +- .../src/plugin/ops/fully_connected.cpp | 4 +- .../intel_gpu/src/plugin/program_builder.cpp | 4 + .../dynamic_quantize_fully_connected.cpp | 30 ++-- .../op/fully_connected_compressed.cpp | 5 +- .../src/plugin/transformations_pipeline.cpp | 22 ++- .../src/runtime/debug_configuration.cpp | 3 + .../src/runtime/execution_config.cpp | 7 +- .../dynamic/matmul_weights_decompression.cpp | 33 +++-- .../test_cases/dynamic_quantize_gpu_test.cpp | 61 +++++--- .../test_cases/fully_connected_gpu_test.cpp | 24 ++-- .../unit/test_cases/hash_key_gpu_test.cpp | 8 +- 25 files changed, 420 insertions(+), 153 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp index 1112a3785317a3..e58c6ab4cb17f1 100644 --- 
a/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/fully_connected_compressed.hpp @@ -22,6 +22,7 @@ class FullyConnectedCompressed : public FullyConnected { const ov::Output &w_decompression_scale, const ov::Output &w_decompression_zero_point, const ov::Output &a_decompression_scale, + const ov::Output &a_decompression_zero_point, const ov::element::Type output_type = ov::element::undefined); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp index 79af223e32cdaa..8dd1ebf2809782 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp @@ -26,9 +26,11 @@ struct dynamic_quantize : public primitive_base { /// @param output_size Output data size of the primitive dynamic_quantize(const primitive_id& id, const input_info& input, - const Attributes& attrs) + const Attributes& attrs, + const size_t input_size = 3) : primitive_base(id, {input}) - , attrs(attrs) { + , attrs(attrs) + , input_size(input_size) { num_outputs = 2; if (attrs.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric && attrs.output_storage_type == ov::op::internal::DynamicQuantize::OutputStorageType::Planar) @@ -36,6 +38,7 @@ struct dynamic_quantize : public primitive_base { } Attributes attrs; + size_t input_size; size_t hash() const override { size_t seed = primitive::hash(); @@ -46,6 +49,7 @@ struct dynamic_quantize : public primitive_base { seed = hash_combine(seed, attrs.scale_dt.hash()); seed = hash_combine(seed, attrs.zp_dt.hash()); seed = hash_combine(seed, attrs.output_storage_type); + seed = hash_combine(seed, input_size); return seed; } @@ -62,7 +66,8 @@ struct dynamic_quantize : public primitive_base { attrs.quantization_dt == rhs_casted.attrs.quantization_dt && attrs.scale_dt == rhs_casted.attrs.scale_dt && attrs.zp_dt == rhs_casted.attrs.zp_dt && - attrs.quantization_type == rhs_casted.attrs.quantization_type;; + attrs.quantization_type == rhs_casted.attrs.quantization_type && + input_size == rhs_casted.input_size; } void save(BinaryOutputBuffer& ob) const override { @@ -75,6 +80,7 @@ struct dynamic_quantize : public primitive_base { ob << make_data(&attrs.output_storage_type, sizeof(attrs.output_storage_type)); ob << attrs.scales_zp_output_order; ob << attrs.group_sizes; + ob << input_size; } void load(BinaryInputBuffer& ib) override { @@ -87,6 +93,7 @@ struct dynamic_quantize : public primitive_base { ib >> make_data(&attrs.output_storage_type, sizeof(attrs.output_storage_type)); ib >> attrs.scales_zp_output_order; ib >> attrs.group_sizes; + ib >> input_size; } }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp index e39078cb1011cc..0819a39534696d 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/fully_connected.hpp @@ -96,6 +96,7 @@ struct fully_connected : public primitive_base { decompression_scale(decompression_scale), decompression_zero_point(decompression_zero_point), dynamic_quantized_activation(false), + dynamic_quantized_activation_zp(false), input_size(input_size), weights_rank(weights_rank) { OPENVINO_ASSERT(!decompression_scale.empty(), "[GPU] Compressed fully connected 
requires at least decompression scale input"); @@ -109,6 +110,7 @@ struct fully_connected : public primitive_base { /// @param compression_scale Primitive id containing scale factors for weights decompression. /// @param compression_zero_point Primitive id containing zero points for weights decompression. /// @param activation_scale Primitive id containing scale factor for activation. + /// @param activation_zero_point Primitive id containing zero point for activation. fully_connected(const primitive_id& id, const input_info& input, const primitive_id& weights, @@ -116,6 +118,7 @@ struct fully_connected : public primitive_base { const primitive_id& decompression_scale, const primitive_id& decompression_zero_point, const input_info& activation_scale, + const input_info& activation_zero_point, const data_types data_type, const size_t input_size = 2, const size_t weights_rank = 2) @@ -126,11 +129,15 @@ struct fully_connected : public primitive_base { decompression_scale(decompression_scale), decompression_zero_point(decompression_zero_point), dynamic_quantized_activation(false), + dynamic_quantized_activation_zp(false), activation_scale(activation_scale), + activation_zero_point(activation_zero_point), input_size(input_size), weights_rank(weights_rank) { if (activation_scale.is_valid()) dynamic_quantized_activation = true; + if (activation_zero_point.is_valid()) + dynamic_quantized_activation_zp = true; OPENVINO_ASSERT(!decompression_scale.empty(), "[GPU] Compressed fully connected requires at least decompression scale input"); } @@ -144,7 +151,9 @@ struct fully_connected : public primitive_base { primitive_id decompression_scale = ""; primitive_id decompression_zero_point = ""; bool dynamic_quantized_activation = false; + bool dynamic_quantized_activation_zp = false; input_info activation_scale = {"", 0}; + input_info activation_zero_point = {"", 0}; optional_value decompression_zero_point_scalar = optional_value(); /// @brief Primitive dimension size. 
@@ -161,6 +170,7 @@ struct fully_connected : public primitive_base { seed = hash_combine(seed, !decompression_scale.empty()); seed = hash_combine(seed, !decompression_zero_point.empty()); seed = hash_combine(seed, activation_scale.is_valid()); + seed = hash_combine(seed, activation_zero_point.is_valid()); seed = hash_combine(seed, decompression_zero_point_scalar.has_value()); seed = hash_combine(seed, decompression_zero_point_scalar.value_or(0.0f)); return seed; @@ -179,6 +189,7 @@ struct fully_connected : public primitive_base { decompression_scale.empty() == rhs_casted.decompression_scale.empty() && decompression_zero_point.empty() == rhs_casted.decompression_zero_point.empty() && activation_scale.is_valid() == rhs_casted.activation_scale.is_valid() && + activation_zero_point.is_valid() == rhs_casted.activation_zero_point.is_valid() && decompression_zero_point_scalar.value_or(0.0f) == rhs_casted.decompression_zero_point_scalar.value_or(0.0f); } @@ -190,9 +201,11 @@ struct fully_connected : public primitive_base { ob << decompression_scale; ob << decompression_zero_point; ob << activation_scale; + ob << activation_zero_point; ob << input_size; ob << weights_rank; ob << dynamic_quantized_activation; + ob << dynamic_quantized_activation_zp; if (decompression_zero_point_scalar.has_value()) { ob << true; @@ -211,9 +224,11 @@ struct fully_connected : public primitive_base { ib >> decompression_scale; ib >> decompression_zero_point; ib >> activation_scale; + ib >> activation_zero_point; ib >> input_size; ib >> weights_rank; ib >> dynamic_quantized_activation; + ib >> dynamic_quantized_activation_zp; bool has_value; ib >> has_value; @@ -243,6 +258,9 @@ struct fully_connected : public primitive_base { if (activation_scale.is_valid()) ret.push_back(activation_scale); + if (activation_zero_point.is_valid()) + ret.push_back(activation_zero_point); + return ret; } }; diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp index a7a8ae1f229a72..52d828353fa155 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp @@ -146,6 +146,7 @@ class debug_configuration { std::vector dynamic_quantize_layers_without_onednn; // Specify Fully-connected layers which enable Dynamic quantization int use_kv_cache_compression; // Enable KV-cache compression int dynamic_quantize_group_size; // Enable Dynamic quantization for fully connected primitive by specified group size + int dynamic_quantize_asym; // Use asymmetric dynamic quantization int disable_horizontal_fc_fusion; // Disable fc horizontal fusion int disable_fc_swiglu_fusion; // Disable swiglu fusion to fc std::set dump_iteration; // Dump n-th execution of network. 
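The asymmetric path that this debug flag enables reduces to one scale and zero point per token (or per group): scale = (CHAR_MAX - CHAR_MIN) / (max - min) and zp = -min * scale, with u8 storage. A minimal standalone sketch of that math, assuming nothing beyond the standard library; the helper name is illustrative, not part of the patch:

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch of asymmetric per-group quantization to u8, mirroring the
// scale = (CHAR_MAX - CHAR_MIN) / (max - min) and zp = -min * scale
// computation used by the dynamic-quantize kernels in this patch.
static void quantize_group_asym_u8(const std::vector<float>& group,
                                   std::vector<uint8_t>& out,
                                   float& scale, float& zero_point) {
    float min_v = std::min(0.0f, *std::min_element(group.begin(), group.end()));
    float max_v = *std::max_element(group.begin(), group.end());
    max_v = std::max(max_v, min_v + 1e-3f);      // guard against a zero range
    scale = 255.0f / (max_v - min_v);            // CHAR_MAX - CHAR_MIN == 255
    zero_point = -min_v * scale;
    out.resize(group.size());
    for (std::size_t i = 0; i < group.size(); ++i) {
        float q = std::round(group[i] * scale + zero_point);
        out[i] = static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, q)));
    }
}
```

Dequantization recovers x ≈ (q - zero_point) / scale, which is roughly what the onednn fully connected path consumes through the new activation scale and zero-point inputs.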
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 29b7cf58a19b54..93f0905b3a1ef7 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -463,7 +463,9 @@ void prepare_primitive_fusing::fuse_bias(program &p) { if (desc->decompression_zero_point_scalar.has_value()) fc_with_bias_prim->decompression_zero_point_scalar = desc->decompression_zero_point_scalar.value(); fc_with_bias_prim->activation_scale = desc->activation_scale; + fc_with_bias_prim->activation_zero_point = desc->activation_zero_point; fc_with_bias_prim->dynamic_quantized_activation = desc->dynamic_quantized_activation; + fc_with_bias_prim->dynamic_quantized_activation_zp = desc->dynamic_quantized_activation_zp; } auto& new_fc_node = p.get_or_create(fc_with_bias_prim); fuse_bias_f(fc, new_fc_node, bias_node, eltw_node); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp index b9fe00ac525720..ca628a48ac76e0 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp @@ -35,6 +35,7 @@ struct dynamic_quantize_impl : typed_primitive_impl_ocl { static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { auto params = get_default_params(impl_param, is_shape_agnostic); + const auto& primitive = impl_param.typed_desc(); params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(1))); // In Some model, the feature size could be dynamic in input0. 
@@ -48,6 +49,10 @@ struct dynamic_quantize_impl : typed_primitive_impl_ocl { if (impl_param.output_layouts.size() > 2) params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(2))); + // Keep 2d data as bf layout + if (primitive->input_size == 2) + params.outputs[0] = params.outputs[0].FlattenFeatureAndSpatials(); + const auto& desc = impl_param.typed_desc(); params.group_sizes = desc->attrs.group_sizes; params.scales_output_order = desc->attrs.scales_zp_output_order; @@ -68,7 +73,8 @@ namespace detail { attach_dynamic_quantize_impl::attach_dynamic_quantize_impl() { auto types = { data_types::f16, - data_types::i8 + data_types::i8, + data_types::u8 }; auto formats = { diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp index 6b93b279129812..6cca9848af3472 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp @@ -83,10 +83,16 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { if (prim->activation_scale.is_valid()) { auto activation_scale_idx = idx++; auto act_scale_mem = instance.dep_memory_ptr(activation_scale_idx); - // TODO: handle group_size here - dnnl::memory::desc desc = onednn::layout_to_memory_desc(act_scale_mem->get_layout(), dnnl::memory::format_tag::a, true); + dnnl::memory::desc desc = onednn::layout_to_memory_desc(act_scale_mem->get_layout(), dnnl::memory::format_tag::ab, true); args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0, act_scale_mem->get_onednn_memory(desc)}); } + + if (prim->activation_zero_point.is_valid()) { + auto activation_zp_idx = idx++; + auto act_zp_mem = instance.dep_memory_ptr(activation_zp_idx); + dnnl::memory::desc desc = onednn::layout_to_memory_desc(act_zp_mem->get_layout(), dnnl::memory::format_tag::ab, true); + args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC_0, act_zp_mem->get_onednn_memory(desc)}); + } } return args; @@ -245,6 +251,7 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { ob << has_bias; ob << is_compressed; ob << prim->dynamic_quantized_activation; + ob << prim->dynamic_quantized_activation_zp; bool has_decompression_scale = !prim->decompression_scale.empty(); if (has_decompression_scale) { @@ -271,10 +278,12 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { bool has_bias = false; bool is_compressed = false; bool dynamic_quantized_activation; + bool dynamic_quantized_activation_zp; ib >> input_size; ib >> has_bias; ib >> is_compressed; ib >> dynamic_quantized_activation; + ib >> dynamic_quantized_activation_zp; const kernel_impl_params* impl_params = reinterpret_cast(ib.getKernelImplParams()); auto prim = impl_params->typed_desc(); @@ -293,11 +302,12 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { bool has_decompression_zp = !prim->decompression_zero_point.empty() || prim->decompression_zero_point_scalar.has_value(); auto& arg = impl_params->get_program().get_node(impl_params->desc->id).as(); - int idx = !arg.bias_term() ? 3 : 4; + int idx = !arg.bias_term() ? 
2 : 3; if (has_decompression_zp) { ib >> make_data(&_dzp_data_type, sizeof(dnnl::memory::data_type)); - auto dzp_layout = arg.get_dependency(idx++).get_output_layout(); + auto decompression_zp_idx = ++idx; + auto dzp_layout = arg.get_dependency(decompression_zp_idx).get_output_layout(); if (dzp_layout.count() == 1) { _attrs->set_zero_points(DNNL_ARG_WEIGHTS, COMMON, dnnl::memory::dims{}, _dzp_data_type); @@ -312,12 +322,17 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { } if (dynamic_quantized_activation) { - // TODO: it supports per-token activation scale only + auto src_scale_idx = ++idx; auto partial_shape = impl_params->get_input_layout(0).get_partial_shape(); auto innermost_len = partial_shape[partial_shape.size() - 1].get_length(); - - auto act_scale_data_type = convert_data_type(impl_params->get_input_layout(idx).data_type); - _attrs->set_scales(DNNL_ARG_SRC, GROUPED, dnnl::memory::dims{1, innermost_len}, act_scale_data_type); + auto& src_scale_shape = impl_params->input_layouts[src_scale_idx].get_partial_shape(); + int src_scale_ngroups = src_scale_shape[src_scale_shape.size() - 1].get_length(); + int src_group_size = innermost_len / src_scale_ngroups; + + auto act_scale_data_type = convert_data_type(impl_params->get_input_layout(src_scale_idx).data_type); + _attrs->set_scales(DNNL_ARG_SRC, GROUPED, dnnl::memory::dims{1, src_group_size}, act_scale_data_type); + if (dynamic_quantized_activation_zp) + _attrs->set_zero_points(DNNL_ARG_SRC, GROUPED, dnnl::memory::dims{1, src_group_size}, dnnl::memory::data_type::u8); } if (is_compressed) { @@ -387,15 +402,21 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { } if (prim->dynamic_quantized_activation) { - // Note: it supports per-token activation scale only - ++idx; - auto partial_shape = impl_params.input_layouts[0].get_partial_shape(); + auto src_scale_idx = ++idx; + auto& partial_shape = impl_params.input_layouts[0].get_partial_shape(); auto innermost_len = partial_shape[partial_shape.size() - 1].get_length(); + auto& src_scale_shape = impl_params.input_layouts[src_scale_idx].get_partial_shape(); + int src_scale_ngroups = src_scale_shape[src_scale_shape.size() - 1].get_length(); + int src_group_size = innermost_len / src_scale_ngroups; - auto act_scale_data_type = convert_data_type(impl_params.input_layouts[idx].data_type); - attr->set_scales(DNNL_ARG_SRC, GROUPED, dnnl::memory::dims{1, innermost_len}, act_scale_data_type); + auto act_scale_data_type = convert_data_type(impl_params.input_layouts[src_scale_idx].data_type); + attr->set_scales(DNNL_ARG_SRC, GROUPED, dnnl::memory::dims{1, src_group_size}, act_scale_data_type); + + if (prim->activation_zero_point.is_valid()) + attr->set_zero_points(DNNL_ARG_SRC, GROUPED, dnnl::memory::dims{1, src_group_size}, dnnl::memory::data_type::u8); } + auto prim_desc = get_matmul_primitive_descriptor(impl_params, impl_params.prog->get_engine(), prim->input_size, !prim->bias.empty(), *attr); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp index 17498831a542d1..62129866927ea4 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp @@ -48,7 +48,7 @@ struct FullyConnectedImplementationManager : public ImplementationManager { one_of(wei_dt, {data_types::i8, data_types::u8}) && one_of(out_dt, {data_types::f16, data_types::f32, data_types::i32, data_types::i8, 
data_types::u8}); bool compressed_case = fc_prim->compressed_weights && - one_of(in0_dt, {data_types::f16, data_types::f32, data_types::i8}) && + one_of(in0_dt, {data_types::f16, data_types::f32, data_types::i8, data_types::u8}) && one_of(wei_dt, {data_types::u8, data_types::i8, data_types::u4, data_types::i4}) && one_of(out_dt, {data_types::f16, data_types::f32, data_types::u8, data_types::i8}); if (!f16f16_case && !f32f32_case && !u8s8_case && !compressed_case) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl index 6db1790844e501..22c620d712770c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl @@ -4,77 +4,180 @@ #include "include/batch_headers/fetch_data.cl" -#if OUTPUT_DIMS != 4 +#if OUTPUT_DIMS != 4 && OUTPUT_DIMS != 2 #error "dynamic_quantize_gpu_opt.cl: Unsupported output dimension" #endif #define VLOAD_N CAT(vload, VEC_SIZE) #define VSTORE_N CAT(vstore, VEC_SIZE) +#define CONVERT_UCHAR_N CAT(convert_uchar, VEC_SIZE) #define CONVERT_CHAR_N CAT(convert_char, VEC_SIZE) #define AS_TYPE_N_(type, n, x) as_##type##n(x) #define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x) #define AS_INPUT_TYPE_N(x) AS_TYPE_N(INPUT0_TYPE, VEC_SIZE, x) +#if QUANTIZE_GROUP_SIZE <= 128 + +#if ASYMMETRIC_QUANTIZATION +#error "UNIMPLEMENTED: asymmetric quantization when group size is small" +#endif + +KERNEL(dynamic_quantize_gpu_opt)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + __global OUTPUT1_TYPE* output_scale + ) { + +#if OUTPUT_DIMS == 2 + const uint b = get_global_id(0); + const uint f_grp = get_global_id(1); + const uint input_offset = INPUT0_GET_INDEX(b, f_grp * QUANTIZE_GROUP_SIZE, 0, 0); + const uint output_offset = OUTPUT_GET_INDEX(b, f_grp * QUANTIZE_GROUP_SIZE, 0, 0); +#else + const uint bf = get_global_id(0); + const uint b = bf / INPUT0_FEATURE_NUM; + const uint f = bf % INPUT0_FEATURE_NUM; + const uint y_grp = get_global_id(1); + const uint input_offset = INPUT0_GET_INDEX(b, f, y_grp * QUANTIZE_GROUP_SIZE, 0); + const uint output_offset = OUTPUT_GET_INDEX(b, f, y_grp * QUANTIZE_GROUP_SIZE, 0); + +#endif + const uint quantize_block = QUANTIZE_GROUP_SIZE / 4; + half4 input_0[quantize_block]; + char4 quantized_value[quantize_block]; + half max[quantize_block]; + + unroll_for (uint i = 0 ; i < quantize_block; ++i) { + input_0[i] = vload4(0, &input[input_offset + i * 4]); + max[i] = fmax(fmax(fabs(input_0[i][0]), fabs(input_0[i][1])), fmax(fabs(input_0[i][2]), fabs(input_0[i][3]))); + } + + half max_value = fmax(0.001h, max[0]); + for (uint i = 1; i < quantize_block; i++) { + max_value = fmax(max_value, max[i]); + } + + half quan_scale = 128.0h / max_value; + + unroll_for (uint i = 0 ; i < quantize_block; ++i) { + quantized_value[i] = convert_char4(input_0[i] * (half4)quan_scale); + vstore4(quantized_value[i], 0, &output[output_offset + i * 4]); + } + +#if OUTPUT_DIMS == 2 + output_scale[OUTPUT1_GET_INDEX(b, f_grp, 0, 0)] = 1.0h / quan_scale; +#else + output_scale[OUTPUT1_GET_INDEX(b, f, y_grp, 0)] = 1.0h / quan_scale; +#endif +} + +#else // !(QUANTIZE_GROUP_SIZE <= 128) + REQD_SUB_GROUP_SIZE(SIMD) KERNEL(dynamic_quantize_gpu_opt)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, - __global OUTPUT1_TYPE* output_scale
+#if ASYMMETRIC_QUANTIZATION + , __global OUTPUT2_TYPE* output_zp +#endif + ) { const uint bf = (uint)get_global_id(2); const uint sglid = get_sub_group_local_id(); const uint local_id = (uint)get_local_id(1); const uint block_size = SIMD * VEC_SIZE; +#if OUTPUT_DIMS == 2 + const uint b_offset = bf * INPUT0_BATCH_PITCH; +#else const uint b_offset = bf * INPUT0_FEATURE_PITCH; - +#endif const uint offset = b_offset + VEC_SIZE * sglid; const uint iteration = ALIGNED_BLOCK_NUM / BLOCK_NUM; - __local half local_mem[BLOCK_NUM]; + __local half local_mem_max[BLOCK_NUM]; + __local half local_mem_min[BLOCK_NUM]; MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE) val[iteration]; MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE) abs_val; - half max = 0.0h; half grp_max = 0.001h; - half max_value; + half grp_min = 0.001h; + half max_value = 0.0h; + half min_value = 0.0h; unroll_for(int i = 0; i < iteration; ++i) { if ((local_id * iteration + i) >= TOTAL_BLOCK_NUM) continue; val[i] = AS_INPUT_TYPE_N(VLOAD_N(0, input + offset + ((local_id * iteration + i) * block_size))); - abs_val = fabs(val[i]); - +#if ASYMMETRIC_QUANTIZATION unroll_for (int j = 0; j < VEC_SIZE; j++) { - max = fmax(max, abs_val[j]); + max_value = fmax(max_value, val[i][j]); + min_value = fmin(min_value, val[i][j]); } + grp_max = fmax(grp_max, max_value); + grp_min = fmin(grp_min, min_value); +#else + abs_val = fabs(val[i]); + + unroll_for (int j = 0; j < VEC_SIZE; j++) + max_value = fmax(max_value, abs_val[j]); - grp_max = fmax(grp_max, max); + grp_max = fmax(grp_max, max_value); +#endif } max_value = sub_group_reduce_max(grp_max); - if (sglid == 0) - local_mem[local_id] = max_value; +#if ASYMMETRIC_QUANTIZATION + min_value = sub_group_reduce_min(grp_min); +#endif + + if (sglid == 0) { + local_mem_max[local_id] = max_value; +#if ASYMMETRIC_QUANTIZATION + local_mem_min[local_id] = min_value; +#endif + } barrier(CLK_LOCAL_MEM_FENCE); for (int j = 0; j < BLOCK_NUM; j++) { - max_value = fmax(max_value, local_mem[j]); + max_value = fmax(max_value, local_mem_max[j]); +#if ASYMMETRIC_QUANTIZATION + min_value = fmin(min_value, local_mem_min[j]); +#endif } - half scale = 127.0h / max_value; +#if ASYMMETRIC_QUANTIZATION + OUTPUT1_TYPE scale = (OUTPUT1_TYPE)((CHAR_MAX - CHAR_MIN) / (max_value - min_value)); + OUTPUT2_TYPE zp = (OUTPUT2_TYPE)(-min_value * scale); +#else + OUTPUT1_TYPE scale = 127.0h / max_value; +#endif + unroll_for(int i = 0; i < iteration; ++i) { if ((local_id * iteration + i) >= TOTAL_BLOCK_NUM) continue; val[i] *= scale; +#if ASYMMETRIC_QUANTIZATION + val[i] += zp; + VSTORE_N(CAT(CONVERT_UCHAR_N, _rte)(val[i]), 0, output + offset + ((local_id * iteration + i) * block_size)); +#else VSTORE_N(CAT(CONVERT_CHAR_N, _rte)(val[i]), 0, output + offset + ((local_id * iteration + i) * block_size)); +#endif } - if (sglid == 0 && local_id == 0) + if (sglid == 0 && local_id == 0) { output_scale[bf] = 1.0h / scale; +#if ASYMMETRIC_QUANTIZATION + output_zp[bf] = convert_uchar_rte(zp); +#endif + } } +#endif // QUANTIZE_GROUP_SIZE <= 128 diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl index 62482b8b9b5047..4acf87eb37ceb0 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl @@ -4,6 +4,16 @@ #include "include/batch_headers/fetch_data.cl" +#define UINT64_MAX 0xFFFFFFFFFFFFFFFF + +#if ASYMMETRIC_QUANTIZATION && 
UNSIGNED_OUTPUT + #define TO_OUTPUT_TYPE_RTE(val) convert_uchar_rte(val) + #define TO_OUTPUT_VEC_TYPE_RTE(val) convert_uchar8_rte(val) +#else + #define TO_OUTPUT_TYPE_RTE(val) convert_char_rte(val) + #define TO_OUTPUT_VEC_TYPE_RTE(val) convert_char8_rte(val) +#endif + #if OUTPUT_DIMS != 4 #error "dynamic_quantize_gpu_ref.cl: Unsupported output dimension" #endif @@ -33,19 +43,21 @@ KERNEL(dynamic_quantize_gpu_ref)( const uint bf = (uint)get_global_id(0); const uint b = bf / INPUT0_FEATURE_NUM; const uint f = bf % INPUT0_FEATURE_NUM; - const uint y = (uint)get_global_id(1); + const uint out_y = (uint)get_global_id(1); + const uint y = out_y * GROUP_SIZE_DIM2; // quantization may be grouped for y axis const uint x = (uint)get_global_id(2); #ifdef SCALES_OUTPUT_ORDER - const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); + const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, out_y, x); #else - const uint scale_idx = OUTPUT1_GET_INDEX_SAFE(b, f, y, x); + const uint scale_idx = OUTPUT1_GET_INDEX_SAFE(b, f, out_y, x); #endif half max_val = INPUT0_VAL_MIN; half min_val = INPUT0_VAL_MAX; for (int b_off = 0; b_off < (GROUP_SIZE_DIM0 == 1 ? 1 : INPUT0_BATCH_NUM); b_off++) { for (int f_off = 0; f_off < (GROUP_SIZE_DIM1 == 1 ? 1 : INPUT0_FEATURE_NUM); f_off++) { - for (int y_off = 0; y_off < (GROUP_SIZE_DIM2 == 1 ? 1 : INPUT0_SIZE_Y); y_off++) { + for (int y_off = 0; y_off < (GROUP_SIZE_DIM2 == UINT64_MAX ? INPUT0_SIZE_Y : GROUP_SIZE_DIM2); y_off++) { + // It is assumed that grouped quantization happens only for 3d input case where we don't have x axis #if GROUP_SIZE_DIM3 == 1 const uint offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, x); half val = input[offset]; @@ -88,53 +100,49 @@ KERNEL(dynamic_quantize_gpu_ref)( #if ASYMMETRIC_QUANTIZATION OUTPUT1_TYPE scale = (OUTPUT1_TYPE)((CHAR_MAX - CHAR_MIN) / (max_val - min_val)); +# if UNSIGNED_OUTPUT + OUTPUT1_TYPE zp = (OUTPUT1_TYPE)(-min_val * scale); +# else // !UNSIGNED_OUTPUT OUTPUT1_TYPE zp = (OUTPUT1_TYPE)(-min_val * scale) - CHAR_MAX; -#else +# endif +#else // !ASYMMETRIC_QUANTIZATION max_val = work_group_reduce_max(max_val); OUTPUT1_TYPE scale = 127.0h / max_val; #endif for (int b_off = 0; b_off < (GROUP_SIZE_DIM0 == 1 ? 1 : INPUT0_BATCH_NUM); b_off++) { for (int f_off = 0; f_off < (GROUP_SIZE_DIM1 == 1 ? 1 : INPUT0_FEATURE_NUM); f_off++) { - for (int y_off = 0; y_off < (GROUP_SIZE_DIM2 == 1 ? 1 : INPUT0_SIZE_Y); y_off++) { + for (int y_off = 0; y_off < (GROUP_SIZE_DIM2 == UINT64_MAX ? 
INPUT0_SIZE_Y : GROUP_SIZE_DIM2); y_off++) { #if GROUP_SIZE_DIM3 == 1 const uint in_offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, x); const uint out_offset = OUTPUT_GET_INDEX(b + b_off, f + f_off, y + y_off, x); half val = input[in_offset]; -#if ASYMMETRIC_QUANTIZATION val *= scale; +#if ASYMMETRIC_QUANTIZATION val += zp; - output[out_offset] = convert_char_rte(val); -#else - val *= scale; - output[out_offset] = convert_char_rte(val); #endif + output[out_offset] = TO_OUTPUT_TYPE_RTE(val); #else const uint in_offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); const uint out_offset = OUTPUT_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); int x; for (x = 0; x < INPUT0_SIZE_X / 8; x++) { half8 val = as_half8(vload8(0, (ushort*)input + in_offset + x * 8)); -#if ASYMMETRIC_QUANTIZATION val *= scale; +#if ASYMMETRIC_QUANTIZATION val += zp; -#else - val *= scale; #endif - vstore8(convert_char8_rte(val), 0, output + out_offset + x * 8); + vstore8(TO_OUTPUT_VEC_TYPE_RTE(val), 0, output + out_offset + x * 8); } x *= 8; for (; x < INPUT0_SIZE_X; x++) { half val = input[in_offset + x]; -#if ASYMMETRIC_QUANTIZATION val *= scale; +#if ASYMMETRIC_QUANTIZATION val += zp; - output[out_offset + x] = convert_char_rte(val); -#else - val *= scale; - output[out_offset + x] = convert_char_rte(val); #endif + output[out_offset + x] = TO_OUTPUT_TYPE_RTE(val); } #endif } @@ -145,6 +153,6 @@ KERNEL(dynamic_quantize_gpu_ref)( #if ASYMMETRIC_QUANTIZATION && GROUP_SCALES_WITH_ZP output_scale[scale_idx + 1] = zp; #elif ASYMMETRIC_QUANTIZATION - output_zp[scale_idx] = zp; + output_zp[scale_idx] = convert_uchar_rte(zp); #endif } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp index 52a648679499f2..b4f667475f26f1 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp @@ -30,9 +30,11 @@ static std::pair get_input_bf_size(const dynamic_quantize_params static size_t get_match_vector_size(const dynamic_quantize_params& params) { auto block_sizes = { 8, 4, 2 }; + auto bf = get_input_bf_size(params); + auto f = bf.second; for (auto block_size : block_sizes) { - if (((params.inputs[0].X().v * params.inputs[0].Y().v) / simd) % block_size == 0) { + if ((f / simd) % block_size == 0) { return block_size; } } @@ -43,10 +45,13 @@ static size_t get_match_vector_size(const dynamic_quantize_params& params) { ParamsKey DynamicQuantizeKernelOpt::GetSupportedKey() const { ParamsKey k; k.EnableInputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::UINT8); k.EnableOutputDataType(Datatype::INT8); k.EnableDifferentTypes(); - k.EnableAllInputLayout(); - k.EnableAllOutputLayout(); + k.EnableInputLayout(DataLayout::bf); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bf); + k.EnableOutputLayout(DataLayout::bfyx); k.EnableTensorOffset(); k.EnableTensorPitches(); k.EnableBatching(); @@ -68,6 +73,8 @@ JitConstants DynamicQuantizeKernelOpt::GetJitConstants(const dynamic_quantize_pa jit.AddConstant(MakeJitConstant("TOTAL_BLOCK_NUM", total_block_num)); jit.AddConstant(MakeJitConstant("ALIGNED_BLOCK_NUM", aligned_block_num)); jit.AddConstant(MakeJitConstant("BLOCK_NUM", block_num)); + jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", params.group_sizes.back())); + 
jit.AddConstant(MakeJitConstant("ASYMMETRIC_QUANTIZATION", params.use_asymmetric_quantization)); jit.Merge(GetTensorFriendlyWorkGroupsJit(params.outputs[0])); return jit; @@ -76,15 +83,20 @@ JitConstants DynamicQuantizeKernelOpt::GetJitConstants(const dynamic_quantize_pa CommonDispatchData DynamicQuantizeKernelOpt::SetDefault(const dynamic_quantize_params& params) const { CommonDispatchData dispatchData; - auto vec_size = get_match_vector_size(params); - auto bf_size = get_input_bf_size(params); - size_t total_block_num = bf_size.second / (simd * vec_size); - size_t batch = get_input_bf_size(params).first; - size_t block_num = (total_block_num > 32) ? 32 : total_block_num; - - dispatchData.gws = {simd, block_num, batch}; - dispatchData.lws = {simd, block_num, 1}; - + if (params.group_sizes.back() <= 128) { + auto bf_size = get_input_bf_size(params); + dispatchData.gws = {bf_size.first, bf_size.second / params.group_sizes.back(), 1}; + dispatchData.lws = {1, 1, 1}; + } else { + auto vec_size = get_match_vector_size(params); + auto bf_size = get_input_bf_size(params); + size_t total_block_num = bf_size.second / (simd * vec_size); + size_t batch = get_input_bf_size(params).first; + size_t block_num = (total_block_num > 32) ? 32 : total_block_num; + + dispatchData.gws = {simd, block_num, batch}; + dispatchData.lws = {simd, block_num, 1}; + } return dispatchData; } @@ -147,8 +159,9 @@ bool DynamicQuantizeKernelOpt::Validate(const Params& params) const { const auto& dq_params = static_cast(params); - // Todo : Add proper exception here - if (((dq_params.inputs[0].X().v * dq_params.inputs[0].Y().v) % (simd * 2)) != 0) + + auto bf = get_input_bf_size(dq_params); + if (((bf.second) % (simd * 2)) != 0) return false; if (dq_params.inputs[0].GetPaddedVal() != 0 || dq_params.outputs[0].GetPaddedVal() != 0) @@ -157,8 +170,10 @@ bool DynamicQuantizeKernelOpt::Validate(const Params& params) const { if (dq_params.append_axis != -1) return false; - if (dq_params.group_sizes.back() != UINT64_MAX) - return false; + for (size_t i = 0; i < dq_params.group_sizes.size() - 1; i++) { + if (dq_params.group_sizes[i] != 1) + return false; + } // Allow only default scales order const auto& scales_output_order = dq_params.scales_output_order; @@ -168,7 +183,16 @@ bool DynamicQuantizeKernelOpt::Validate(const Params& params) const { return false; } + if (dq_params.use_asymmetric_quantization) { + if (dq_params.combine_scales_and_zp) + return false; + if (dq_params.outputs[0].GetDType() != Datatype::UINT8) + return false; + } + return true; } + + } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp index bd3d0f87cdc931..f432fa6ac5756d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp @@ -11,6 +11,7 @@ ParamsKey DynamicQuantizeKernelRef::GetSupportedKey() const { ParamsKey k; k.EnableInputDataType(Datatype::F16); k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::UINT8); k.EnableInputLayout(DataLayout::bfyx); k.EnableOutputLayout(DataLayout::bfyx); k.EnableTensorOffset(); @@ -53,6 +54,7 @@ JitConstants DynamicQuantizeKernelRef::GetJitConstants(const dynamic_quantize_pa jit.AddConstant(MakeJitConstant("ASYMMETRIC_QUANTIZATION", 
params.use_asymmetric_quantization)); jit.AddConstant(MakeJitConstant("GROUP_SCALES_WITH_ZP", params.combine_scales_and_zp)); + jit.AddConstant(MakeJitConstant("UNSIGNED_OUTPUT", params.outputs[0].GetDType() == Datatype::UINT8 ? 1 : 0)); auto group_sizes = params.group_sizes; group_sizes.resize(std::min((size_t)4, group_sizes.size()), 1); @@ -71,12 +73,26 @@ CommonDispatchData DynamicQuantizeKernelRef::SetDefault(const dynamic_quantize_p OPENVINO_ASSERT(params.outputs[0].GetLayout() == DataLayout::bfyx, "It supports only 4d tensor"); auto group_sizes = params.group_sizes; - group_sizes.resize(std::min((size_t)4, group_sizes.size()), 1); + group_sizes.resize(std::max((size_t)4, group_sizes.size()), 1); auto batch_size = group_sizes[0] == 1 ? params.outputs[0].Batch().v : 1; auto feature_size = group_sizes[1] == 1 ? params.outputs[0].Feature().v : 1; auto y_size = group_sizes[2] == 1 ? params.outputs[0].Y().v : 1; auto x_size = group_sizes[3] == 1 ? params.outputs[0].X().v : 1; + OPENVINO_ASSERT( + (group_sizes[0] == 1 || group_sizes[0] == params.outputs[0].Batch().v || group_sizes[0] == UINT64_MAX) && + (group_sizes[1] == 1 || group_sizes[1] == params.outputs[0].Feature().v || group_sizes[1] == UINT64_MAX) && + (group_sizes[2] == 1 || group_sizes[2] == params.outputs[0].Y().v || group_sizes[2] == UINT64_MAX + || (params.outputs[0].Y().v % group_sizes[2] == 0 && params.outputs[0].X().v == 1)) && // Grouped quantization is only supported for 3d case + (group_sizes[3] == 1 || group_sizes[3] == params.outputs[0].X().v || group_sizes[3] == UINT64_MAX), + "[GPU] Unsupported dynamic quantization configuration: (", + group_sizes[0], ",", group_sizes[1], ",", group_sizes[2], ",", group_sizes[3], ") - (", + params.outputs[0].Batch().v, ",", params.outputs[0].Feature().v, ",", params.outputs[0].Y().v, ",", params.outputs[0].X().v, ")"); + + // Grouped quantization is supported only over y axis + if (params.group_sizes[2] > 1 && params.group_sizes[2] != UINT64_MAX) + y_size = params.outputs[0].Y().v / params.group_sizes[2]; + dispatchData.gws = {batch_size * feature_size, y_size, x_size}; dispatchData.lws = {1, 1, 1}; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index 46e8f7f1104f0d..68da7aea7b1fe6 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -124,16 +124,16 @@ static bool should_dynamic_quantize(const fully_connected_params& params, bool p if ((scale_group_size % simd == 0) && (input_f % dynamic_quantization_group_size == 0) && (params.is_shape_agnostic || (params.inputs[0].Batch().v > 1 && input_b > min_slm_size)) && params.inputs[0].GetDType() == Datatype::F16 && is_weight_dyn_quantizable(params)) { - if (print_log) { - GPU_DEBUG_TRACE_DETAIL << " Dynamic quantizing for FC : scale_group_size: " << scale_group_size << - ", Dyn-quan group size: " << dynamic_quantization_group_size << - ", Type(I:" << kernel_selector::toString(params.inputs[0].GetDType()) << - ", O:" << kernel_selector::toString(params.outputs[0].GetDType()) << - ", W:" << kernel_selector::toString(params.weights.GetDType()) << - "), Format(W:" << kernel_selector::toString(params.weights.GetLayout()) << - ") B: " << params.inputs[0].Batch().v << ", F: " << params.inputs[0].Feature().v << 
- ", Y: " << params.inputs[0].Y().v << std ::endl; - } + if (print_log) { + GPU_DEBUG_TRACE_DETAIL << " Dynamic quantizing for FC : scale_group_size: " << scale_group_size << + ", Dyn-quan group size: " << dynamic_quantization_group_size << + ", Type(I:" << kernel_selector::toString(params.inputs[0].GetDType()) << + ", O:" << kernel_selector::toString(params.outputs[0].GetDType()) << + ", W:" << kernel_selector::toString(params.weights.GetDType()) << + "), Format(W:" << kernel_selector::toString(params.weights.GetLayout()) << + ") B: " << params.inputs[0].Batch().v << ", F: " << params.inputs[0].Feature().v << + ", Y: " << params.inputs[0].Y().v << std ::endl; + } return true; } diff --git a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp index 85f28cbd711678..4c11bdb21971e9 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp @@ -18,7 +18,8 @@ static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptrget_attrs()); + op->get_attrs(), + op->get_input_partial_shape(0).size()); prim.num_outputs = op->get_output_size(); diff --git a/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp index 7b0aa921ef3ad5..5f4fe19c5c4c08 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp @@ -26,7 +26,7 @@ namespace ov { namespace intel_gpu { static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::shared_ptr& op) { - validate_inputs_count(op, {4, 5, 6}); + validate_inputs_count(op, {4, 5, 6, 7}); auto inputs = p.GetInputInfo(op); std::string primitive_name = layer_type_name_ID(op); auto supports_immad = p.get_engine().get_device_info().supports_immad; @@ -39,6 +39,7 @@ static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::share const size_t W_ZP_IDX = input_idx; std::string zp_name = op->get_input_size() > input_idx ? inputs[input_idx++].pid : ""; auto activation_scale_input = op->get_input_size() > input_idx ? inputs[input_idx++] : cldnn::input_info(); + auto activation_zero_point_input = op->get_input_size() > input_idx ? inputs[input_idx++] : cldnn::input_info(); float zp_value = 0.0f; bool has_scalar_zp = false; @@ -58,6 +59,7 @@ static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::share scale_name, has_scalar_zp && !supports_immad ? 
"" : zp_name, activation_scale_input, + activation_zero_point_input, cldnn::element_type_to_data_type(op->get_output_element_type(0)), op->get_input_partial_shape(0).size(), op->get_input_partial_shape(1).size()); diff --git a/src/plugins/intel_gpu/src/plugin/program_builder.cpp b/src/plugins/intel_gpu/src/plugin/program_builder.cpp index b623c86fabe02c..368e25abe2ddac 100644 --- a/src/plugins/intel_gpu/src/plugin/program_builder.cpp +++ b/src/plugins/intel_gpu/src/plugin/program_builder.cpp @@ -10,6 +10,7 @@ #include "openvino/op/lstm_sequence.hpp" #include "openvino/op/loop.hpp" #include "openvino/op/search_sorted.hpp" +#include "ov_ops/dynamic_quantize.hpp" #include "intel_gpu/plugin/common_utils.hpp" #include "intel_gpu/plugin/program_builder.hpp" @@ -357,6 +358,9 @@ bool ProgramBuilder::requires_new_shape_infer(const std::shared_ptr& o if (ov::is_type(op)) return true; + if (ov::is_type(op)) + return true; + if (ov::is_type(op)) { const auto body_function = std::static_pointer_cast(op)->get_function(); if (body_function->is_dynamic()) diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index c36212713ae717..61dc40e2713800 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -21,24 +21,11 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(uint64_t group_size : ov::pass::MatcherPass() { GPU_DEBUG_GET_INSTANCE(debug_config); using namespace ov::pass::pattern; - - // per-token quantization is supported - if (group_size != UINT64_MAX) { - GPU_DEBUG_TRACE << "Dynamic quantization is disabled " << group_size << std::endl; - return; - } - auto is_dynamic = [](const ov::Output& output) -> bool { - bool is_dynamic = output.get_node_shared_ptr()->get_output_partial_shape(0).is_dynamic(); - size_t num_inputs = output.get_node_shared_ptr()->get_input_size(); - for (size_t idx = 0; idx < num_inputs; idx++) { - is_dynamic |= output.get_node_shared_ptr()->get_input_partial_shape(idx).is_dynamic(); - } - return is_dynamic; - }; + using QuantizationType = ov::op::internal::DynamicQuantize::QuantizationType; auto data = any_input(); - auto fully_connected_compressed3 = wrap_type({data, any_input(), any_input(), any_input()}, is_dynamic); - auto fully_connected_compressed4 = wrap_type({data, any_input(), any_input(), any_input(), any_input()}, is_dynamic); + auto fully_connected_compressed3 = wrap_type({data, any_input(), any_input(), any_input()}); + auto fully_connected_compressed4 = wrap_type({data, any_input(), any_input(), any_input(), any_input()}); auto fully_connected_compressed = std::make_shared(OutputVector{fully_connected_compressed3, fully_connected_compressed4}); @@ -65,12 +52,20 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(uint64_t group_size ov::op::internal::DynamicQuantize::Attributes config; config.quantization_dt = element::i8; - config.quantization_type = ov::op::internal::DynamicQuantize::QuantizationType::Symmetric; + config.quantization_type = QuantizationType::Symmetric; config.scale_dt = element::f16; config.group_sizes = shape_group_size; + if (debug_config->dynamic_quantize_asym) { + config.quantization_type = QuantizationType::Asymmetric; + config.quantization_dt = element::u8; + config.zp_dt = element::u8; // it supports u8 only now + } + auto dyn_quan = 
std::make_shared(m_data, config); auto optional_w_zp = m_fc->get_input_size() > 4 ? m_fc->get_input_node_shared_ptr(4) : std::make_shared(); + auto optional_a_zp = config.quantization_type == QuantizationType::Symmetric ? + std::make_shared() : dyn_quan->output(2); auto output_type = m_fc->get_output_type(); if (output_type == ov::element::undefined) @@ -82,6 +77,7 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(uint64_t group_size m_fc->get_input_node_shared_ptr(3), optional_w_zp, dyn_quan->output(1), + optional_a_zp, output_type); ov::replace_node(m_fc, new_fc); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp index 2e3819d7e850ee..dd5c555b1e6bc8 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/fully_connected_compressed.cpp @@ -14,11 +14,13 @@ FullyConnectedCompressed::FullyConnectedCompressed(const ov::Output& A, const ov::Output& w_decompression_scale, const ov::Output& w_decompression_zero_point, const ov::Output& a_decompression_scale, + const ov::Output& a_decompression_zero_point, const ov::element::Type output_type) : FullyConnected(A, B, bias, output_type) { set_argument(3, w_decompression_scale); set_argument(4, w_decompression_zero_point); set_argument(5, a_decompression_scale); + set_argument(6, a_decompression_zero_point); validate_and_infer_types(); } @@ -60,12 +62,13 @@ std::shared_ptr FullyConnectedCompressed::clone_with_new_inputs(const new_args.at(3), new_args.at(4), m_output_type); - else if (new_args.size() == 6) + else if (new_args.size() == 7) return std::make_shared(new_args.at(0), new_args.at(1), new_args.at(2), new_args.at(3), new_args.at(4), + new_args.at(5), new_args.at(6), m_output_type); else diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index e47ccbb09a9c43..50eecf51b945b7 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -975,18 +975,34 @@ void TransformationsPipeline::apply(std::shared_ptr func) { // This Validate is needed for proper data type propagation after applying IncreasePositionIdsPrecision pass manager.register_pass(); - auto dynamic_quantization_group_size = config.get_property(ov::hint::dynamic_quantization_group_size); if (device_info.supports_immad) { + auto dynamic_quantization_group_size = config.get_property(ov::hint::dynamic_quantization_group_size); pass_config->set_callback([=](const_node_ptr& root) -> bool { if (root->get_input_node_shared_ptr(0)->get_element_type() == ov::element::Type_t::f32) { - GPU_DEBUG_TRACE << root->get_friendly_name() << " Dynamic quantization is turned off because input type is not supported" << std::endl; + GPU_DEBUG_TRACE << root->get_friendly_name() << " dyn_quan is turned off: input type is not supported" << std::endl; return true; } auto weight_shape = root->get_input_partial_shape(1); const size_t innermost_size = weight_shape[weight_shape.size() - 1].get_length(); if (innermost_size < 32) { - GPU_DEBUG_TRACE << "Dynamic quantization: shape is too small " << innermost_size << " / " << dynamic_quantization_group_size << std::endl; + GPU_DEBUG_TRACE << root->get_friendly_name() << " dyn_quan is turned off: shape is too small - " << innermost_size << std::endl; + return 
true; + } + + // AZP does not support 8bit weight + if (debug_config->dynamic_quantize_asym + && (root->get_input_element_type(1) == ov::element::i8 || root->get_input_element_type(1) == ov::element::u8)) { + GPU_DEBUG_TRACE << root->get_friendly_name() << " dyn_quan is turned off: asym quantization does not support 8bit weight" << std::endl; + return true; + } + + bool has_wzp = root->get_input_size() > 4; + if ((root->get_input_element_type(1) == ov::element::i8 || root->get_input_element_type(1) == ov::element::u8) + && has_wzp + && dynamic_quantization_group_size != UINT64_MAX) { + GPU_DEBUG_TRACE << root->get_friendly_name() << " dyn_quan is turned off:" + " asym 8bit weight does not support grouped quantization" << std::endl; return true; } return false; diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp index 65ca31f16c720c..380480dccc68bf 100644 --- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp +++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp @@ -190,6 +190,7 @@ static void print_help_messages() { "separated by space. Support case-insensitive and regular expression. For example .*fully_connected.*"); message_list.emplace_back("OV_GPU_DynamicQuantizeGroupSize", "Specify a group size of dynamic quantization to enable " "dynamic quantization for Fully-connected primitive."); + message_list.emplace_back("OV_GPU_DynamicQuantizeAsym", "Enable asymmetric dynamic quantization when set as 1."); message_list.emplace_back("OV_GPU_DisableHorizontalFCFusion", "Disable horizontal fc fusion"); message_list.emplace_back("OV_GPU_DisableFCSwigluFusion", "Disable fc + swiglu fusion"); message_list.emplace_back("OV_GPU_DumpIteration", "Dump n-th execution of network, separated by space."); @@ -260,6 +261,7 @@ debug_configuration::debug_configuration() , use_usm_host(0) , use_kv_cache_compression(-1) , dynamic_quantize_group_size(DYNAMIC_QUANTIZE_GROUP_SIZE_NOT_SET) + , dynamic_quantize_asym(0) , disable_horizontal_fc_fusion(0) , disable_fc_swiglu_fusion(0) { #ifdef GPU_DEBUG_CONFIG @@ -315,6 +317,7 @@ debug_configuration::debug_configuration() get_gpu_debug_env_var("UseUsmHost", use_usm_host); get_gpu_debug_env_var("KVCacheCompression", use_kv_cache_compression); get_gpu_debug_env_var("DynamicQuantizeGroupSize", dynamic_quantize_group_size); + get_gpu_debug_env_var("DynamicQuantizeAsym", dynamic_quantize_asym); get_gpu_debug_env_var("DisableHorizontalFCFusion", disable_horizontal_fc_fusion); get_gpu_debug_env_var("DisableFCSwigluFusion", disable_fc_swiglu_fusion); std::string dump_iteration_str; diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index 30a9477e1600dd..804ad81f2d3735 100644 --- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -57,7 +57,7 @@ void ExecutionConfig::set_default() { std::make_tuple(ov::internal::query_model_ratio, 1.0f), std::make_tuple(ov::cache_mode, ov::CacheMode::OPTIMIZE_SPEED), std::make_tuple(ov::cache_encryption_callbacks, EncryptionCallbacks{}), - std::make_tuple(ov::hint::dynamic_quantization_group_size, 32), + std::make_tuple(ov::hint::dynamic_quantization_group_size, 0), std::make_tuple(ov::hint::kv_cache_precision, ov::element::undefined), std::make_tuple(ov::intel_gpu::hint::enable_kernels_reuse, false), std::make_tuple(ov::weights_path, ""), @@ -254,6 +254,11 @@ void 
ExecutionConfig::apply_user_properties(const cldnn::device_info& info) { set_property(ov::hint::kv_cache_precision(ov::element::i8)); } + // Enable dynamic quantization by default for non-systolic platforms + if (!is_set_by_user(ov::hint::dynamic_quantization_group_size) && !info.supports_immad) { + set_property(ov::hint::dynamic_quantization_group_size(32)); + } + user_properties.clear(); } diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp index 27c57aa072878d..b430884decb71a 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp @@ -58,7 +58,8 @@ using MatmulWeightsDecompressionParams = std::tuple; class MatmulWeightsDecompression : public testing::WithParamInterface, @@ -74,6 +75,7 @@ class MatmulWeightsDecompression : public testing::WithParamInterface(dyn_input_ps.size(), 1); - group_sizes.back() = UINT64_MAX; + group_sizes.back() = group_size; - auto input_data = rg.generate_random_1d(ov::shape_size(data_shape), -16.0f, 16.0f); + auto input_data = rg.generate_random_1d(ov::shape_size(data_shape), -16.0f, 20.0f); set_values(input_mem, input_data); auto in_layout_f32 = input_shape.is_dynamic() ? layout{ dyn_input_ps, data_types::f32, format::bfyx } @@ -53,17 +58,15 @@ class dynamic_quantization_gpu_tests: public ::testing::Test { dynamic_quantize::Attributes dq_config; dq_config.quantization_type = quantization_type; - dq_config.quantization_dt = data_types::i8; + dq_config.quantization_dt = quant_dt; dq_config.scale_dt = data_types::f16; - dq_config.zp_dt = data_types::undefined; + dq_config.zp_dt = zp_dt; dq_config.group_sizes = group_sizes; - dq_config.scales_zp_output_order = { 0, 1, 2, 3 }; - dq_config.output_storage_type = ov::op::internal::DynamicQuantize::OutputStorageType::Planar; + dq_config.scales_zp_output_order = { 0, 1, 2}; - if (quantization_type == QuantizationType::Asymmetric) { - dq_config.zp_dt = data_types::f16; - dq_config.output_storage_type = ov::op::internal::DynamicQuantize::OutputStorageType::InterleavedScalesZP; - } + if (data_shape.size() == 4) + dq_config.scales_zp_output_order.emplace_back(3); + dq_config.output_storage_type = storage_type; auto reorder_1 = reorder("reorder_1", input_info("input"), layout{ input_ps, data_types::f16, format::bfyx }); auto dyn_quan_prim = dynamic_quantize("dyn_quan_prim", input_info("reorder_1"), dq_config); @@ -156,6 +159,19 @@ TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_single_batch) { this->test_dynamic_quantization(false, {-1, 1, 1, 4096}, {1, 1, 1, 4096}); } +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_asym_act) { + this->test_dynamic_quantization(false, {-1, 1, 1, 4096}, {1, 1, 1, 4096}, QuantizationType::Asymmetric, UINT64_MAX, + data_types::u8, data_types::u8, OutputStorageType::Planar); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_small_size_grouped) { + this->test_dynamic_quantization(false, {1, 1, 4096}, {64, 1, 4096}, QuantizationType::Symmetric, 32); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_single_batch_grouped) { + this->test_dynamic_quantization(false, {-1, 1, 4096}, {1, 1, 4096}, QuantizationType::Symmetric, 32); +} + TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_ref_only) { this->test_dynamic_quantization(false, {-1, 1, 1, 33}, 
{16, 1, 1, 33}); } @@ -177,33 +193,36 @@ TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_unaligned_dynamic) { } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache) { - this->test_dynamic_quantization(false, {-1, 8, -1, 96}, {1, 8, 1, 96}, QuantizationType::Symmetric, "dynamic_quantize_gpu_kv_cache"); + this->test_dynamic_quantization(false, {-1, 8, -1, 96}, {1, 8, 1, 96}, QuantizationType::Symmetric, UINT64_MAX, + data_types::i8, data_types::undefined, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched) { - this->test_dynamic_quantization(false, {-1, 4, -1, 64}, {1, 4, 35, 64}, QuantizationType::Symmetric, "dynamic_quantize_gpu_kv_cache"); + this->test_dynamic_quantization(false, {-1, 4, -1, 64}, {1, 4, 35, 64}, QuantizationType::Symmetric, UINT64_MAX, + data_types::i8, data_types::undefined, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_reordered) { - this->test_dynamic_quantization(false, {-1, -1, 8, 96}, {1, 1, 8, 96}, QuantizationType::Symmetric, "dynamic_quantize_gpu_kv_cache"); + this->test_dynamic_quantization(false, {-1, -1, 8, 96}, {1, 1, 8, 96}, QuantizationType::Symmetric, UINT64_MAX, + data_types::i8, data_types::undefined, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched_reordered) { - this->test_dynamic_quantization(false, {-1, -1, 4, 64}, {1, 35, 4, 64}, QuantizationType::Symmetric, "dynamic_quantize_gpu_kv_cache"); + this->test_dynamic_quantization(false, {-1, -1, 4, 64}, {1, 35, 4, 64}, QuantizationType::Symmetric, UINT64_MAX, + data_types::i8, data_types::undefined, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_asym) { - this->test_dynamic_quantization(false, {-1, 8, -1, 96}, {1, 8, 1, 96}, QuantizationType::Asymmetric, "dynamic_quantize_gpu_kv_cache"); + this->test_dynamic_quantization(false, {-1, 8, -1, 96}, {1, 8, 1, 96}, QuantizationType::Asymmetric, UINT64_MAX, + data_types::i8, data_types::f16, OutputStorageType::InterleavedScalesZP, "dynamic_quantize_gpu_kv_cache"); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched_asym) { - this->test_dynamic_quantization(false, {-1, 4, -1, 64}, {1, 4, 35, 64}, QuantizationType::Asymmetric, "dynamic_quantize_gpu_kv_cache"); + this->test_dynamic_quantization(false, {-1, 4, -1, 64}, {1, 4, 35, 64}, QuantizationType::Asymmetric, UINT64_MAX, + data_types::i8, data_types::f16, OutputStorageType::InterleavedScalesZP, "dynamic_quantize_gpu_kv_cache"); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_reordered_asym) { - this->test_dynamic_quantization(false, {-1, -1, 8, 96}, {1, 1, 8, 96}, QuantizationType::Asymmetric, "dynamic_quantize_gpu_kv_cache"); -} - -TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched_reordered_asym) { - this->test_dynamic_quantization(false, {-1, -1, 4, 64}, {1, 35, 4, 64}, QuantizationType::Asymmetric, "dynamic_quantize_gpu_kv_cache"); + this->test_dynamic_quantization(false, {-1, -1, 8, 96}, {1, 1, 8, 96}, QuantizationType::Asymmetric, UINT64_MAX, + data_types::i8, data_types::f16, OutputStorageType::InterleavedScalesZP, "dynamic_quantize_gpu_kv_cache"); } diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp 
b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp index 6bf44a31add0f4..f59dc5c42cffc1 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp @@ -1555,7 +1555,7 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); config.set_property(ov::intel_gpu::optimize_data(true)); - config.set_property(ov::hint::dynamic_quantization_group_size(32)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(32)); network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); @@ -1643,7 +1643,7 @@ class fully_connected_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl }; config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); - config.set_property(ov::hint::dynamic_quantization_group_size(0)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); network network(engine, topology, config); network.set_input_data("input", input_mem); @@ -1669,7 +1669,7 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); config.set_property(ov::intel_gpu::optimize_data(true)); - config.set_property(ov::hint::dynamic_quantization_group_size(0)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); @@ -1753,7 +1753,7 @@ class fully_connected_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl }; config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); - config.set_property(ov::hint::dynamic_quantization_group_size(0)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); network network(engine, topology, config); network.set_input_data("input", input_mem); @@ -1780,9 +1780,9 @@ class fully_connected_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); config.set_property(ov::intel_gpu::optimize_data(true)); if (is_dyn_quan) { - config.set_property(ov::hint::dynamic_quantization_group_size(32)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(32)); } else { - config.set_property(ov::hint::dynamic_quantization_group_size(0)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); } network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); @@ -1923,7 +1923,7 @@ class fully_connected_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); ov::intel_gpu::ImplementationDesc fc_impl = { in_layout.format, "", impl_types::ocl }; config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "fc_prim1", fc_impl }, { "fc_prim2", fc_impl } })); - config.set_property(ov::hint::dynamic_quantization_group_size(0)); + 
config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); network network(engine, topology, config); network.set_input_data("input", input_mem); @@ -1952,7 +1952,7 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); config.set_property(ov::intel_gpu::optimize_data(true)); - config.set_property(ov::hint::dynamic_quantization_group_size(0)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); @@ -2905,7 +2905,7 @@ class fully_connected_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl }; config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); - config.set_property(ov::hint::dynamic_quantization_group_size(0)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); network network(engine, topo, config); network.set_input_data("input", input_mem); @@ -2931,7 +2931,7 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); config.set_property(ov::intel_gpu::optimize_data(true)); - config.set_property(ov::hint::dynamic_quantization_group_size(quantize_group_size)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(quantize_group_size)); network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), false); @@ -3031,7 +3031,7 @@ class fully_connected_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bf_tiled", impl_types::ocl }; config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); - config.set_property(ov::hint::dynamic_quantization_group_size(0)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); network network(engine, topo, config); network.set_input_data("input", input_mem); @@ -3057,7 +3057,7 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); config.set_property(ov::intel_gpu::optimize_data(true)); - config.set_property(ov::hint::dynamic_quantization_group_size(quantize_group_size)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(quantize_group_size)); network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), false); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp index fb30222998008b..3384fb1ed514f6 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp @@ -71,11 +71,11 @@ class check_hash_value: public ::testing::Test { const auto primitive_hash = primitve->hash(); const auto params_hash = primitve->type->get_fake_aligned_params(*prim_inst->get_impl_params()).hash(); if (!engine.get_device_info().supports_immad) { - ASSERT_EQ(primitive_hash, 
8017451717095756666UL); - ASSERT_EQ(params_hash, 8889154389021912103UL); + ASSERT_EQ(primitive_hash, 9510988594087947885UL); + ASSERT_EQ(params_hash, 7833603199176871790UL); } else { - ASSERT_EQ(primitive_hash, 8017451717095756666UL); - ASSERT_EQ(params_hash, 10847775446937354749UL); + ASSERT_EQ(primitive_hash, 9510988594087947885UL); + ASSERT_EQ(params_hash, 16259702189938020305UL); } } From a3f4edb3d8f12769c7ae7d39206730502fae711f Mon Sep 17 00:00:00 2001 From: Taylor Yeonbok Lee Date: Mon, 9 Dec 2024 14:47:37 +0900 Subject: [PATCH 12/23] [GPU] Fix crash on swiglu fused case (due to outer_ofm == 1) (#27972) ### Details: - fixed crash happens in minicpm-1b-sft int4 model ### Tickets: - *ticket-id* --- .../fully_connected_kernel_bf_tiled.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index 68da7aea7b1fe6..d0f881adcd88b1 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -435,10 +435,14 @@ FullyConnected_bf_tiled::GetAutoTuneParams(const fully_connected_params& params, return selector.Default(tune_params(1, 1, 4, 4, 1, 1, 1, EXE_MODE_DEFAULT)); } } else if (is_weight_small_kn(params, output_f)) { - if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) - return selector.Default(tune_params(1, 1, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT)); - else + if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) { + if (swiglu_fused) + return selector.Default(tune_params(1, 1, 4, 2, 2, 1, 1, EXE_MODE_DEFAULT)); + else + return selector.Default(tune_params(1, 1, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT)); + } else { return selector.Default(tune_params(1, 2, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT)); + } } else { if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) { return selector.Default(tune_params(1, 1, 4, 4, 1, 1, 1, EXE_MODE_DEFAULT)); @@ -865,7 +869,9 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa auto output_f = get_output_aligned_bf_size(fc_params, false).second; WeightsLayout weights_layout = WeightsLayout::os_iyx_osv16; - if (!is_swiglu_fused(fc_params) && fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16 + if (is_swiglu_fused(fc_params)) { + weights_layout = WeightsLayout::os_is_yx_osv32_isv2; + } else if (fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16 && (fc_params.weights.GetLayout() == WeightsLayout::oiyx || fc_params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) && (fc_params.weights.GetDType() == WeightsType::INT4 || fc_params.weights.GetDType() == WeightsType::UINT4) && is_weight_horizontal(fc_params, output_f)) { From 27138a8af6b9cd8e79b394ab5b56b4c61fd7deba Mon Sep 17 00:00:00 2001 From: Sebastian Golebiewski Date: Mon, 9 Dec 2024 07:40:37 +0100 Subject: [PATCH 13/23] [DOCS] saveModelSync method in Node.js addon (#27960) Porting: #27958 Signed-off-by: sgolebiewski-intel --- docs/sphinx_setup/api/nodejs_api/addon.rst | 37 ++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/docs/sphinx_setup/api/nodejs_api/addon.rst b/docs/sphinx_setup/api/nodejs_api/addon.rst index f6ee4ab7b15836..7c42824bcd88a3 100644 --- 
a/docs/sphinx_setup/api/nodejs_api/addon.rst +++ b/docs/sphinx_setup/api/nodejs_api/addon.rst @@ -49,6 +49,7 @@ The **openvino-node** package exports ``addon`` which contains the following pro resizeAlgorithm: typeof resizeAlgorithm; PrePostProcessor: PrePostProcessorConstructor; }; + saveModelSync(model: Model, path: string, compressToFp16?: boolean): void; element: typeof element; } @@ -142,3 +143,39 @@ Properties - **Defined in:** `addon.ts:674 `__ + +.. rubric:: saveModelSync + +* + + .. code-block:: ts + + saveModelSync(model: Model, path: string, compressToFp16?: boolean): void; + + + This method saves a model to IR (xml and bin files), applying all + necessary transformations that are usually added during model conversion. + Particularly, weights are compressed to FP16 by default, and debug information + in model nodes is cleaned up. + + * **Parameters:** + + - model: :doc:`Model ` + + A model which will be converted to IR and saved. + + - path: string + + A path for saving the model. + + - ``Optional`` + + - compressToFp16: boolean + + Compression of weights to FP16 floating point precision. The default value is `true` . + + * **Returns:** void + + * **Defined in:** + `addon.ts:692 `__ + From 15a9b617fcfd591a14daf632cdeecbe99255bd64 Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Mon, 9 Dec 2024 12:33:16 +0400 Subject: [PATCH 14/23] [TF FE] Run If tests on all platforms (#27966) **Details:** Run If tests on all platforms **Ticket:** TBD --------- Signed-off-by: Kazantsev, Roman --- .../tensorflow_tests/test_tf_If.py | 44 ++++++++----------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_If.py b/tests/layer_tests/tensorflow_tests/test_tf_If.py index 67686ef53a5750..21dee5aa28616d 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_If.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_If.py @@ -1,13 +1,13 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import platform - import numpy as np import pytest import tensorflow as tf from common.tf_layer_test_class import CommonTFLayerTest +rng = np.random.default_rng(32345) + class TestIfFloat(CommonTFLayerTest): def _prepare_input(self, inputs_info): @@ -18,9 +18,9 @@ def _prepare_input(self, inputs_info): x_shape = inputs_info['x:0'] y_shape = inputs_info['y:0'] inputs_data = {} - inputs_data['cond:0'] = np.random.randint(0, 2, cond_shape).astype(bool) - inputs_data['x:0'] = np.random.randint(1, 10, x_shape).astype(np.float32) - inputs_data['y:0'] = np.random.randint(-50, 50, y_shape).astype(np.float32) + inputs_data['cond:0'] = rng.integers(0, 2, cond_shape).astype(bool) + inputs_data['x:0'] = rng.integers(1, 10, x_shape).astype(np.float32) + inputs_data['y:0'] = rng.integers(-50, 50, y_shape).astype(np.float32) return inputs_data def create_if_net(self, x_shape, y_shape, lower_control_flow): @@ -69,12 +69,10 @@ def else_branch(): @pytest.mark.parametrize("params", test_data_basic) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', - reason='Ticket - 122716') def test_if_basic(self, params, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): if ie_device == 'GPU': - pytest.xfail('104855') + pytest.xfail('104855: If operation is not supported by GPU') self._test(*self.create_if_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) @@ -89,9 +87,9 @@ def _prepare_input(self, 
inputs_info): ind_shape = inputs_info['ind:0'] y_shape = inputs_info['y:0'] inputs_data = {} - inputs_data['cond:0'] = np.random.randint(0, 2, cond_shape).astype(bool) - inputs_data['ind:0'] = np.random.randint(1, 10, ind_shape).astype(np.int32) - inputs_data['y:0'] = np.random.randint(-50, 50, y_shape).astype(np.float32) + inputs_data['cond:0'] = rng.integers(0, 2, cond_shape).astype(bool) + inputs_data['ind:0'] = rng.integers(1, 10, ind_shape).astype(np.int32) + inputs_data['y:0'] = rng.integers(-50, 50, y_shape).astype(np.float32) return inputs_data def create_if_net(self, ind_shape, y_shape, lower_control_flow): @@ -141,12 +139,10 @@ def else_branch(): @pytest.mark.parametrize("params", test_data_basic) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', - reason='Ticket - 122716') def test_if_basic(self, params, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): if ie_device == 'GPU': - pytest.xfail('104855') + pytest.xfail('104855: If operation is not supported by GPU') self._test(*self.create_if_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) @@ -161,9 +157,9 @@ def _prepare_input(self, inputs_info): y_shape = inputs_info['y:0'] z_shape = inputs_info['z:0'] inputs_data = {} - inputs_data['x:0'] = np.random.randint(0, 6, x_shape).astype(np.int32) - inputs_data['y:0'] = np.random.randint(1, 10, y_shape).astype(np.float32) - inputs_data['z:0'] = np.random.randint(-50, 50, z_shape).astype(np.float32) + inputs_data['x:0'] = rng.integers(0, 6, x_shape).astype(np.int32) + inputs_data['y:0'] = rng.integers(1, 10, y_shape).astype(np.float32) + inputs_data['z:0'] = rng.integers(-50, 50, z_shape).astype(np.float32) return inputs_data def create_if_net(self, y_shape, z_shape, lower_control_flow): @@ -221,12 +217,10 @@ def else_branch(): @pytest.mark.parametrize("params", test_data_basic) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', - reason='Ticket - 122716') def test_if_basic(self, params, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): if ie_device == 'GPU': - pytest.xfail('104855') + pytest.xfail('104855: If operation is not supported by GPU') self._test(*self.create_if_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) @@ -241,9 +235,9 @@ def _prepare_input(self, inputs_info): x_shape = inputs_info['x:0'] y_shape = inputs_info['y:0'] inputs_data = {} - inputs_data['cond:0'] = np.random.randint(0, 2, cond_shape).astype(bool) - inputs_data['x:0'] = np.random.randint(1, 10, x_shape).astype(np.float32) - inputs_data['y:0'] = np.random.randint(-50, 50, y_shape).astype(np.float32) + inputs_data['cond:0'] = rng.integers(0, 2, cond_shape).astype(bool) + inputs_data['x:0'] = rng.integers(1, 10, x_shape).astype(np.float32) + inputs_data['y:0'] = rng.integers(-50, 50, y_shape).astype(np.float32) return inputs_data def create_sequential_ifs_net(self, x_shape, y_shape, lower_control_flow): @@ -313,12 +307,10 @@ def else_branch(): @pytest.mark.parametrize("params", test_data_basic) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', - reason='Ticket - 122716') def test_if_basic(self, params, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): if ie_device == 'GPU': - 
pytest.xfail('104855') + pytest.xfail('104855: If operation is not supported by GPU') self._test(*self.create_sequential_ifs_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) From 408a5e065200b1fcb41200f9361094fa1c7df5d7 Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Mon, 9 Dec 2024 17:49:45 +0900 Subject: [PATCH 15/23] [GPU] update onednn to latest 3.7-pc (#27811) --- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index 0f269193c74663..36e090a367a431 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit 0f269193c7466313888d3338209d0d06a22cc6fa +Subproject commit 36e090a367a4312a1caa2db9e95fb94d17d7573b From de949b4a2b59faf1bf701528dd37b7ecd076d4e0 Mon Sep 17 00:00:00 2001 From: Yuan Hu Date: Mon, 9 Dec 2024 17:08:40 +0800 Subject: [PATCH 16/23] [CPU] enable brdgmm kernel in CPU plugin (#27589) ### Details: - *replace impl string brdgmm with brgconv* - *add test case* - *remove skip CVS-56143 config, CVS-56143 is already closed* - *remove skip CVS-53578 config, CVS-53578 is already closed* - *use new ticket CVS-157596 to track leftover test case* ### Tickets: - *CVS-156792* --------- Signed-off-by: HU Yuan2 --- src/plugins/intel_cpu/src/nodes/conv.cpp | 13 +- .../intel_cpu/src/onednn/iml_type_mapper.cpp | 3 + .../intel_cpu/src/onednn/iml_type_mapper.h | 3 + .../single_layer_tests/group_convolution.cpp | 126 +++++++++++++++++- .../skip_tests_config.cpp | 10 +- 5 files changed, 140 insertions(+), 15 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/conv.cpp b/src/plugins/intel_cpu/src/nodes/conv.cpp index 7cf7698e989343..53d53d093cfabf 100644 --- a/src/plugins/intel_cpu/src/nodes/conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/conv.cpp @@ -343,6 +343,7 @@ const std::vector& Convolution::getDefaultImplPriority() { impl_desc_type::winograd_acl, impl_desc_type::gemm_acl, impl_desc_type::acl, + impl_desc_type::brgconv_avx512_dw, impl_desc_type::brgconv_avx512_amx_1x1, impl_desc_type::brgconv_avx512_amx, impl_desc_type::jit_avx512_amx_dw, @@ -353,6 +354,7 @@ const std::vector& Convolution::getDefaultImplPriority() { impl_desc_type::jit_avx512_dw, impl_desc_type::jit_avx512_1x1, impl_desc_type::jit_avx512, + impl_desc_type::brgconv_avx2_dw, impl_desc_type::brgconv_avx2_1x1, impl_desc_type::brgconv_avx2, impl_desc_type::jit_uni_dw, @@ -815,7 +817,11 @@ void Convolution::initSupportedPrimitiveDescriptors() { #endif for (size_t dIdx = 0; dIdx < descs.size(); dIdx++) { auto& desc = descs[dIdx]; - auto first_desc = dnnl::primitive_desc(DnnlExtensionUtils::clone_primitive_desc(desc.get())); + auto primitive_desc = desc.get(true); //true mean allow empty + if (primitive_desc == nullptr) { + continue; + } + auto first_desc = dnnl::primitive_desc(DnnlExtensionUtils::clone_primitive_desc(primitive_desc)); auto add_supported_desc = [&](dnnl::primitive_desc& desc) { addSupportedPrimitiveDescriptor(desc); @@ -823,7 +829,7 @@ void Convolution::initSupportedPrimitiveDescriptors() { }; const bool first_match = customImplPriorities.empty(); - DEBUG_LOG("#", getName(), + DEBUG_LOG("#", getName(), ",descIndex:", dIdx + 1, "/", descs.size(), ", itpd.impl_info_str(): ", desc.impl_info_str(), ", parsed imp_type: ", impl_type_to_string(parse_impl_name(desc.impl_info_str())), ", first_match: ", first_match ? 
"true" : "false"); @@ -944,8 +950,7 @@ void Convolution::createDescriptor(const std::vector& inputDesc, const auto desc = createDescriptorInternal(getEngine(), inDnnlDesc, weightDnnlDesc, biasDnnlDesc, outDnnlDesc, withBiases, stride, dilation, paddingL, paddingR, alg, attr); - if (desc) - descs.emplace_back(desc); + descs.emplace_back(desc); } } } diff --git a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp index d7a1e5979ddad9..5c57a94f69f67d 100644 --- a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp +++ b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp @@ -17,6 +17,7 @@ impl_desc_type parse_impl_name(std::string impl_desc_name) { if (pos != std::string::npos) impl_desc_name.replace(pos, std::string(#_wrd).length(), #_sub); } // Replace the ONEDNN pd name with OV definition. REPLACE_WORD(brg_conv, brgconv); + REPLACE_WORD(brdgmm, brgconv); REPLACE_WORD(avx10_1_512, avx512); REPLACE_WORD(brg_matmul, brgemm); @@ -119,6 +120,8 @@ const char* impl_type_to_string(impl_desc_type type) { CASE(brgconv_sse42_1x1); CASE(brgconv_uni_1x1); CASE(brgconv_avx512_amx_1x1); + CASE(brgconv_avx512_dw); + CASE(brgconv_avx2_dw); CASE(brgemm_avx512); CASE(brgemm_avx2); CASE(brgemm_avx); diff --git a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h index 3fd79716c7cd72..45a71bdb88dd33 100644 --- a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h +++ b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h @@ -98,6 +98,9 @@ enum impl_desc_type : int64_t { brgconv_uni_1x1 = brgconv | uni | _1x1, brgconv_avx512_amx_1x1 = brgconv | avx512 | amx | _1x1, + brgconv_avx2_dw = brgconv_avx2 | _dw, + brgconv_avx512_dw = brgconv_avx512 | _dw, + brgemm_avx512 = brgemm | avx512, brgemm_avx2 = brgemm | avx2, brgemm_avx = brgemm | avx, diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/group_convolution.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/group_convolution.cpp index 47d7d3072b7337..f3f5b1f2e07975 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/group_convolution.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/group_convolution.cpp @@ -5,6 +5,7 @@ #include "shared_test_classes/single_op/group_convolution.hpp" #include "common_test_utils/node_builders/group_convolution.hpp" +#include "openvino/runtime/system_conf.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" #include "utils/convolution_params.hpp" #include "utils/cpu_test_utils.hpp" @@ -176,14 +177,15 @@ class GroupConvolutionLayerCPUTest : public testing::WithParamInterface()) { - selectedType += "_bf16"; - rel_threshold = 1e-2f; - } else { - selectedType = makeSelectedTypeStr(selectedType, netType); + const auto& it = configuration.find(ov::hint::inference_precision.name()); + if (it != configuration.end()) { + if (ov::element::bf16 == it->second.as()) { + rel_threshold = 1e-2f; + } else if (ov::element::f16 == it->second.as()) { + rel_threshold = 0.00125f; + } } + selectedType = makeSelectedTypeStr(selectedType, deduce_expected_precision(netType, configuration)); // according to range propagation feature, resolution of generated inputs data for parameters moved from 32 to 32768 // 'real' part of input data was changed and some fails became visible for cases with Elu and FakeQuantize, so let's setup abs_threshold @@ -289,6 +291,7 @@ std::vector filterCPUInfoForDeviceSupportBF16(std::vector fusingParamsSetBF16{emptyFusingSpec, 
// sum fusingSum}; +const std::vector fusingParamsSet_Brdgmm{emptyFusingSpec, + // eltwise + fusingRelu, + fusingPRelu1D, + // depthwise + fusingReluScaleShift, + // fake quantize + fusingFakeQuantizePerTensorRelu, + fusingFakeQuantizePerChannelRelu + // sum + // comment out sum due to MFDNN-12841 + //fusingSumEluFQ, + //fusingSum + }; + +const std::vector fusingParamsSetBF16_Brdgmm{emptyFusingSpec, + // eltwise + fusingRelu, + // depthwise + fusingReluScaleShift + // sum + // comment out sum due to MFDNN-12841 + //fusingSum + }; + +const std::vector fusingParamsSetFP16_Brdgmm = fusingParamsSetBF16_Brdgmm; + /* ============= GroupConvolution params (planar layout) ============= */ const std::vector numOutChannels_Gemm = {6}; const std::vector numGroups_Gemm = {2, 3}; @@ -1299,6 +1329,38 @@ INSTANTIATE_TEST_SUITE_P(smoke_GroupConv_2D_DW_FP32, ::testing::Values(empty_plugin_config)), GroupConvolutionLayerCPUTest::getTestCaseName); +const std::vector> dilations2d_Brdgmm = {{1, 1}}; +const auto groupConvParams_ExplicitPadding_DW_2D_Brdgmm = ::testing::Combine(::testing::ValuesIn(kernels2d), + ::testing::ValuesIn(strides2d), + ::testing::ValuesIn(padBegins2d), + ::testing::ValuesIn(padEnds2d), + ::testing::ValuesIn(dilations2d_Brdgmm), + ::testing::ValuesIn(numOutChannels_DW), + ::testing::ValuesIn(numGroups_DW), + ::testing::Values(ov::op::PadType::EXPLICIT)); +const auto BrdgmmCPUSpec = []()-> std::vector { + std::string isaStr; + if (ov::with_cpu_x86_avx512f()) { + isaStr = "avx512"; + } else { + isaStr = "avx2"; + } + return {CPUSpecificParams{{}, {}, {}, "brgconv_" + isaStr + "_dw"}}; +}; + +INSTANTIATE_TEST_SUITE_P(smoke_GroupConv_2D_DW_FP32_Brdgmm, + GroupConvolutionLayerCPUTest, + ::testing::Combine(::testing::Combine(groupConvParams_ExplicitPadding_DW_2D_Brdgmm, + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::undefined), + ::testing::Values(ElementType::undefined), + ::testing::ValuesIn(inputShapes2dDW), + ::testing::Values(ov::test::utils::DEVICE_CPU)), + ::testing::ValuesIn(filterCPUInfoForDevice(BrdgmmCPUSpec())), + ::testing::ValuesIn(fusingParamsSet_Brdgmm), + ::testing::Values(empty_plugin_config)), + GroupConvolutionLayerCPUTest::getTestCaseName); + INSTANTIATE_TEST_SUITE_P(smoke_GroupConv_2D_DW_BF16, GroupConvolutionLayerCPUTest, ::testing::Combine(::testing::Combine(groupConvParams_ExplicitPadding_DW_2D, @@ -1313,6 +1375,32 @@ INSTANTIATE_TEST_SUITE_P(smoke_GroupConv_2D_DW_BF16, ::testing::Values(cpu_bf16_plugin_config)), GroupConvolutionLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_GroupConv_2D_DW_BF16_Brdgmm, + GroupConvolutionLayerCPUTest, + ::testing::Combine(::testing::Combine(groupConvParams_ExplicitPadding_DW_2D_Brdgmm, + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::undefined), + ::testing::Values(ElementType::undefined), + ::testing::ValuesIn(inputShapes2dDW), + ::testing::Values(ov::test::utils::DEVICE_CPU)), + ::testing::ValuesIn(filterCPUInfoForDeviceSupportBF16(BrdgmmCPUSpec())), + ::testing::ValuesIn(fusingParamsSetBF16_Brdgmm), + ::testing::Values(cpu_bf16_plugin_config)), + GroupConvolutionLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_GroupConv_2D_DW_FP16_Brdgmm, + GroupConvolutionLayerCPUTest, + ::testing::Combine(::testing::Combine(groupConvParams_ExplicitPadding_DW_2D_Brdgmm, + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::undefined), + ::testing::Values(ElementType::undefined), + ::testing::ValuesIn(inputShapes2dDW), + 
::testing::Values(ov::test::utils::DEVICE_CPU)), + ::testing::ValuesIn(filterCPUInfoForDevice(BrdgmmCPUSpec())), + ::testing::ValuesIn(fusingParamsSetFP16_Brdgmm), + ::testing::Values(cpu_f16_plugin_config)), + GroupConvolutionLayerCPUTest::getTestCaseName); + /* ============= GroupConvolution (DW 3D) ============= */ const auto groupConvParams_ExplicitPadding_DW_3D = ::testing::Combine(::testing::ValuesIn(kernels3d), ::testing::ValuesIn(strides3d), @@ -1349,6 +1437,30 @@ INSTANTIATE_TEST_SUITE_P(smoke_GroupConv_3D_DW_FP32, ::testing::ValuesIn(fusingParamsSet), ::testing::Values(empty_plugin_config)), GroupConvolutionLayerCPUTest::getTestCaseName); + +const std::vector> dilations3d_Brdgmm = {{1, 1, 1}}; +const auto groupConvParams_ExplicitPadding_DW_3D_Brdgmm = ::testing::Combine(::testing::ValuesIn(kernels3d), + ::testing::ValuesIn(strides3d), + ::testing::ValuesIn(padBegins3d), + ::testing::ValuesIn(padEnds3d), + ::testing::ValuesIn(dilations3d_Brdgmm), + ::testing::ValuesIn(numOutChannels_DW), + ::testing::ValuesIn(numGroups_DW), + ::testing::Values(ov::op::PadType::EXPLICIT)); + +INSTANTIATE_TEST_SUITE_P(smoke_GroupConv_3D_DW_FP32_Brdgmm, + GroupConvolutionLayerCPUTest, + ::testing::Combine(::testing::Combine(groupConvParams_ExplicitPadding_DW_3D_Brdgmm, + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::undefined), + ::testing::Values(ElementType::undefined), + ::testing::ValuesIn(inputShapes3dDW), + ::testing::Values(ov::test::utils::DEVICE_CPU)), + ::testing::ValuesIn(filterCPUInfoForDevice(BrdgmmCPUSpec())), + ::testing::ValuesIn(fusingParamsSet_Brdgmm), + ::testing::Values(empty_plugin_config)), + GroupConvolutionLayerCPUTest::getTestCaseName); + /* ========= */ /* ============= SINGLE TEST CASES ============= */ diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index b675a7c2da7d42..089a03b4d6bba7 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -40,10 +40,12 @@ std::vector disabledTestPatterns() { R"(.*BinaryConvolutionLayerTest.*)", // TODO: 53618. BF16 gemm ncsp convolution crash R"(.*_GroupConv.*_inFmts=nc.*_primitive=jit_gemm.*ENFORCE_BF16=YES.*)", - // TODO: 53578. fork DW bf16 convolution does not support 3d cases yet - R"(.*_DW_GroupConv.*_inFmts=(ndhwc|nCdhw16c).*ENFORCE_BF16=YES.*)", - // TODO: 56143. 
Enable nspc convolutions for bf16 precision - R"(.*ConvolutionLayerCPUTest.*_inFmts=(ndhwc|nhwc).*INFERENCE_PRECISION_HINT=bf16.*)", + // TODO: 157596 convolution bf16 leftover test case + R"(smoke_JIT_AVX512_DW_GroupConv/GroupConvolutionLayerCPUTest.*ndhwc.*jit_avx512_dw.*INFERENCE_PRECISION_HINT=bf16.*)", + R"(smoke_Conv_1D_1x1_BF16/ConvolutionLayerCPUTest\.CompareWithRefs/IS=\[\]_TS=\(\((1|2)\.6(4|7)\.7\)_\)_K\(1\)_S\(1\)_PB\(0\)_PE\(0\)_D=\(1\)_O=63_AP=explicit_netPRC=f32_inPRC=undefined_outPRC=undefined_trgDev=CPU_inFmts=nhwc_outFmts=nhwc_primitive=jit_avx512_1x1_.*PluginConf_INFERENCE_PRECISION_HINT=bf16)", + R"(smoke_Conv_1D_1x1_BF16/ConvolutionLayerCPUTest\.CompareWithRefs/IS=\[1\.\.200\.64\.\?\]_TS=\(\(2\.64\.7\)_\(1\.64\.5\)_\)_K\(1\)_S\(1\)_PB\(0\)_PE\(0\)_D=\(1\)_O=63_AP=explicit_netPRC=f32_inPRC=undefined_outPRC=undefined_trgDev=CPU_inFmts=nhwc_outFmts=nhwc_primitive=jit_avx512_1x1_.*PluginConf_INFERENCE_PRECISION_HINT=bf16)", + R"(smoke_Conv_1D_1x1_BF16/ConvolutionLayerCPUTest\.CompareWithRefs/IS=\[\?\.6(4|7)\.1\.\.200\]_TS=\(\(2\.6(4|7)\.7\)_\(1\.6(4|7)\.9\)_\)_K\(1\)_S\(1\)_PB\(0\)_PE\(0\)_D=\(1\)_O=63_AP=explicit_netPRC=f32_inPRC=undefined_outPRC=undefined_trgDev=CPU_inFmts=nhwc_outFmts=nhwc_primitive=jit_avx512_1x1_.*PluginConf_INFERENCE_PRECISION_HINT=bf16)", + R"(smoke_GroupConv_brgemm_2D_BF16/GroupConvolutionLayerCPUTest\.CompareWithRefs/IS=\[\]_TS=\(\(1\.64\.7\.7\)_\)_K\(3\.3\)_S\(2\.2\)_PB\((0|1)\.(0|1)\)_PE\(0\.0\)_D=\(2\.2\)_O=64_G=2_AP=explicit_netPRC=f32_inPRC=undefined_outPRC=undefined_trgDev=CPU_inFmts=nhwc_outFmts=nhwc_primitive=brgconv_avx512_amx_.*PluginConf_INFERENCE_PRECISION_HINT=bf16)", // TODO: 56827. Sporadic test failures R"(.*smoke_Conv.+_FP32.ConvolutionLayerCPUTest\.CompareWithRefs.*TS=\(\(.\.67.+\).*inFmts=n.+c.*_primitive=jit_avx2.*)", // incorrect jit_uni_planar_convolution with dilation = {1, 2, 1} and output channel 1 From de776f279c87e542c640acc8140aaf87f278c991 Mon Sep 17 00:00:00 2001 From: Andrei Kashchikhin Date: Mon, 9 Dec 2024 09:27:11 +0000 Subject: [PATCH 17/23] [CI] [GHA] Introduce additional Python (3.9-3.12) API tests on macOS (#27666) ### Details: - Based on #27304, should be reviewed after it. ### Tickets: - *152690* --- .github/workflows/job_python_api_tests.yml | 142 ++++++++++++++++++++ .github/workflows/job_python_unit_tests.yml | 54 ++------ .github/workflows/job_samples_tests.yml | 14 +- .github/workflows/linux_arm64.yml | 10 ++ .github/workflows/mac.yml | 60 ++++++++- .github/workflows/mac_arm64.yml | 57 +++++++- .github/workflows/ubuntu_22.yml | 10 ++ .github/workflows/ubuntu_24.yml | 10 ++ 8 files changed, 304 insertions(+), 53 deletions(-) create mode 100644 .github/workflows/job_python_api_tests.yml diff --git a/.github/workflows/job_python_api_tests.yml b/.github/workflows/job_python_api_tests.yml new file mode 100644 index 00000000000000..541a14e2b1b6df --- /dev/null +++ b/.github/workflows/job_python_api_tests.yml @@ -0,0 +1,142 @@ +name: Python API tests + +on: + workflow_call: + inputs: + runner: + description: 'Machine on which the tests would run' + type: string + required: true + container: + description: 'JSON to be converted to the value of the "container" configuration for the job' + type: string + required: false + default: '{"image": null}' + python-version: + description: 'Python version to setup. 
E.g., "3.11"' + type: string + required: true + +permissions: read-all + +env: + PIP_CACHE_PATH: /mount/caches/pip/linux + +jobs: + Python_Unit_Tests: + name: Python API tests + timeout-minutes: 30 + runs-on: ${{ inputs.runner }} + container: ${{ fromJSON(inputs.container) }} + defaults: + run: + shell: bash + env: + DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input + OPENVINO_REPO: ${{ github.workspace }}/openvino + INSTALL_DIR: ${{ github.workspace }}/install + INSTALL_TEST_DIR: ${{ github.workspace }}/install/openvino_tests + INSTALL_WHEELS_DIR: ${{ github.workspace }}/install/openvino_wheels + steps: + - name: Download OpenVINO artifacts (tarballs and wheels) + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + pattern: openvino_@(wheels|tests) + path: ${{ env.INSTALL_DIR }} + + # Needed as ${{ github.workspace }} is not working correctly when using Docker + - name: Setup Variables + run: | + echo "OPENVINO_REPO=$GITHUB_WORKSPACE/openvino" >> "$GITHUB_ENV" + echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" + echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/openvino_tests" >> "$GITHUB_ENV" + echo "INSTALL_WHEELS_DIR=$GITHUB_WORKSPACE/install/openvino_wheels" >> "$GITHUB_ENV" + + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install pigz + + - name: Extract OpenVINO packages + run: pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_TEST_DIR} + working-directory: ${{ env.INSTALL_TEST_DIR }} + + - name: Fetch setup_python and install wheels actions + uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + with: + sparse-checkout: | + .github/actions/setup_python/action.yml + .github/actions/install_ov_wheels/action.yml + sparse-checkout-cone-mode: false + path: 'action_root' + + - name: Setup Python ${{ inputs.python-version }} + uses: ./action_root/.github/actions/setup_python + with: + version: ${{ inputs.python-version }} + pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH || '' }} + should-setup-pip-paths: ${{ runner.os == 'Linux' }} + self-hosted-runner: ${{ runner.os == 'Linux' }} + + # + # Tests + # + - name: Install OpenVINO Python wheels + uses: ./action_root/.github/actions/install_ov_wheels + with: + wheels-dir-path: ${{ env.INSTALL_WHEELS_DIR }} + wheels-to-install: 'openvino' + + - name: Install Python API tests dependencies + run: python3 -m pip install -r ${INSTALL_TEST_DIR}/tests/bindings/python/requirements_test.txt + + # + # Tests + # + + - name: Python API Tests + run: | + # for 'template' extension + export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}/tests/:$LD_LIBRARY_PATH + python3 -m pytest -sv ${INSTALL_TEST_DIR}/tests/pyopenvino \ + --junitxml=${INSTALL_TEST_DIR}/TEST-Pyngraph.xml \ + --ignore=${INSTALL_TEST_DIR}/tests/pyopenvino/tests/test_utils/test_utils.py + + - name: Python API Tests -- numpy>=2.0.0 + run: | + python3 -m pip uninstall -y numpy + python3 -m pip install "numpy~=2.0.0" + python3 -m pip install -r ${INSTALL_TEST_DIR}/tests/bindings/python/requirements_test.txt + # for 'template' extension + export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}/tests/:$LD_LIBRARY_PATH + python3 -m pytest -sv ${INSTALL_TEST_DIR}/tests/pyopenvino \ + --junitxml=${INSTALL_TEST_DIR}/TEST-Pyngraph_new_numpy.xml \ + --ignore=${INSTALL_TEST_DIR}/tests/pyopenvino/tests/test_utils/test_utils.py + + - name: Clone API snippets + if: runner.os != 'macOS' + uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + with: + 
sparse-checkout: docs/articles_en/assets/snippets + path: ${{ env.OPENVINO_REPO }} + submodules: 'false' + + - name: Docs Python snippets + if: runner.os != 'macOS' + run: | + # torch, onnx + python3 -m pip install -r ${INSTALL_TEST_DIR}/tests/python/preprocess/torchvision/requirements.txt -r ${INSTALL_TEST_DIR}/tests/requirements_onnx + # to find 'snippets' module in docs + export PYTHONPATH=${OPENVINO_REPO}/docs/articles_en/assets + # for 'template' extension + export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}/tests/:$LD_LIBRARY_PATH + python3 ${OPENVINO_REPO}/docs/articles_en/assets/snippets/main.py + + - name: Upload Test Results + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + if: ${{ !cancelled() }} + with: + name: test-results-python-api-${{ inputs.python-version }} + path: | + ${{ env.INSTALL_TEST_DIR }}/TEST*.html + ${{ env.INSTALL_TEST_DIR }}/TEST*.xml + if-no-files-found: 'warn' diff --git a/.github/workflows/job_python_unit_tests.yml b/.github/workflows/job_python_unit_tests.yml index 8075f3299fe063..47506c83bf0945 100644 --- a/.github/workflows/job_python_unit_tests.yml +++ b/.github/workflows/job_python_unit_tests.yml @@ -65,21 +65,22 @@ jobs: echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "LAYER_TESTS_INSTALL_DIR=$GITHUB_WORKSPACE/install/tests/layer_tests" >> "$GITHUB_ENV" + echo "INSTALL_WHEELS_DIR=$GITHUB_WORKSPACE/install/wheels" >> "$GITHUB_ENV" - name: Install OpenVINO dependencies (mac) if: runner.os == 'macOS' run: brew install pigz - name: Extract OpenVINO packages - run: | - pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} + run: pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - - name: Fetch setup_python action + - name: Fetch setup_python and install wheels actions uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml + .github/actions/install_ov_wheels/action.yml sparse-checkout-cone-mode: false path: 'action_root' @@ -92,11 +93,10 @@ jobs: self-hosted-runner: ${{ runner.os == 'Linux' }} - name: Install OpenVINO Python wheels - run: | - # Install the core OV wheel - python3 -m pip install ./openvino-*.whl - - working-directory: ${{ env.INSTALL_WHEELS_DIR }} + uses: ./action_root/.github/actions/install_ov_wheels + with: + wheels-dir-path: ${{ env.INSTALL_WHEELS_DIR }} + wheels-to-install: 'openvino' - name: Install Python API tests dependencies run: | @@ -121,15 +121,6 @@ jobs: # Tests # - - name: Python API Tests - if: ${{ fromJSON(inputs.affected-components).Python_API.test }} - run: | - # for 'template' extension - export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}:$LD_LIBRARY_PATH - python3 -m pytest -sv ${INSTALL_TEST_DIR}/pyopenvino \ - --junitxml=${INSTALL_TEST_DIR}/TEST-Pyngraph.xml \ - --ignore=${INSTALL_TEST_DIR}/pyopenvino/tests/test_utils/test_utils.py - - name: Python ONNX operators tests if: (fromJSON(inputs.affected-components).Python_API.test || fromJSON(inputs.affected-components).ONNX_FE.test) && @@ -185,35 +176,6 @@ jobs: TEST_DEVICE: CPU TEST_PRECISION: FP16 - - name: Clone API snippets - if: runner.os != 'macOS' - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - sparse-checkout: docs/articles_en/assets/snippets - path: ${{ env.OPENVINO_REPO }} - submodules: 'false' - - - name: Docs Python snippets - if: runner.os != 'macOS' - run: | - # to find 
'snippets' module in docs - export PYTHONPATH=${OPENVINO_REPO}/docs/articles_en/assets - # for 'template' extension - export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}:$LD_LIBRARY_PATH - python3 ${OPENVINO_REPO}/docs/articles_en/assets/snippets/main.py - - - name: Python API Tests -- numpy>=2.0.0 - if: ${{ fromJSON(inputs.affected-components).Python_API.test }} - run: | - python3 -m pip uninstall -y numpy - python3 -m pip install "numpy>=2.0.0,<2.2.0" - python3 -m pip install -r ${INSTALL_TEST_DIR}/bindings/python/requirements_test.txt - # for 'template' extension - export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}:$LD_LIBRARY_PATH - python3 -m pytest -sv ${INSTALL_TEST_DIR}/pyopenvino \ - --junitxml=${INSTALL_TEST_DIR}/TEST-Pyngraph.xml \ - --ignore=${INSTALL_TEST_DIR}/pyopenvino/tests/test_utils/test_utils.py - - name: Upload Test Results uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: ${{ !cancelled() }} diff --git a/.github/workflows/job_samples_tests.yml b/.github/workflows/job_samples_tests.yml index e144aa0cfb95aa..6f95d316abfc3f 100644 --- a/.github/workflows/job_samples_tests.yml +++ b/.github/workflows/job_samples_tests.yml @@ -54,6 +54,7 @@ jobs: echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "BUILD_DIR=$GITHUB_WORKSPACE/build" >> "$GITHUB_ENV" + echo "INSTALL_WHEELS_DIR=$GITHUB_WORKSPACE/install/wheels" >> "$GITHUB_ENV" - name: Install OpenVINO dependencies (mac) if: runner.os == 'macOS' @@ -65,13 +66,12 @@ jobs: pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - - name: Fetch setup_python action - # Python is already installed on Ubuntu within Dockerfile - if: runner.os != 'Linux' + - name: Fetch setup_python and install wheels actions uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml + .github/actions/install_ov_wheels/action.yml sparse-checkout-cone-mode: false path: 'openvino' @@ -113,6 +113,12 @@ jobs: # Tests # + - name: Install OpenVINO Python wheels + uses: ./openvino/.github/actions/install_ov_wheels + with: + wheels-dir-path: ${{ env.INSTALL_WHEELS_DIR }} + wheels-to-install: 'openvino' + - name: Samples tests if: fromJSON(inputs.affected-components).samples.test run: | @@ -122,7 +128,7 @@ jobs: export SHARE=$INSTALL_TEST_DIR/smoke_tests/samples_smoke_tests_data # Install Python benchmark_app by installing openvino-*.whl - python3 -m pip install --ignore-installed PyYAML -r $INSTALL_TEST_DIR/smoke_tests/requirements.txt $INSTALL_WHEELS_DIR/openvino-*.whl + python3 -m pip install --ignore-installed PyYAML -r $INSTALL_TEST_DIR/smoke_tests/requirements.txt export LD_LIBRARY_PATH=${IE_APP_PATH}:$LD_LIBRARY_PATH source ${INSTALL_DIR}/setupvars.sh diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml index 66ce9461f05fe8..e1aaa886d631c7 100644 --- a/.github/workflows/linux_arm64.yml +++ b/.github/workflows/linux_arm64.yml @@ -169,6 +169,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + Python_API_Tests: + name: Python API tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + with: + runner: 'aks-linux-16-cores-arm' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' + python-version: '3.11' + if: 
fromJSON(needs.smart_ci.outputs.affected_components).Python_API.test + TensorFlow_Layer_Tests: name: TensorFlow Layer Tests needs: [ Build, Docker, Smart_CI, Openvino_tokenizers ] diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index c587c5ad7323b3..26289e969c4e00 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -151,6 +151,7 @@ jobs: -DENABLE_CPPLINT=OFF \ -DENABLE_NCC_STYLE=OFF \ -DENABLE_TESTS=ON \ + -DENABLE_WHEEL=OFF \ -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ -DENABLE_STRICT_DEPENDENCIES=OFF \ -DCMAKE_CXX_COMPILER_LAUNCHER=${{ env.CMAKE_CXX_COMPILER_LAUNCHER }} \ @@ -168,7 +169,6 @@ jobs: run: | cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR }} -P ${{ env.BUILD_DIR }}/cmake_install.cmake cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_TEST_DIR }} -DCOMPONENT=tests -P ${{ env.BUILD_DIR }}/cmake_install.cmake - cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_WHEELS_DIR }} -DCOMPONENT=python_wheels -P ${{ env.BUILD_DIR }}/cmake_install.cmake - name: Pack Artifacts run: | @@ -179,6 +179,48 @@ jobs: tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_tests.tar.gz popd + # Setup additional Python versions for wheels building + - name: Setup Python 3.9 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.9" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Setup Python 3.10 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.10" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Setup Python 3.12 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.12" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Build additional Python wheels + run: | + for py_version in "3.9" "3.10" "3.11" "3.12" + do + python_exec_path=$(python$py_version -c "import sys; print(sys.executable)") + $python_exec_path -m pip install -r ${{ env.OPENVINO_REPO }}/src/bindings/python/wheel/requirements-dev.txt + + cmake -DPython3_EXECUTABLE=$python_exec_path -DENABLE_WHEEL=ON -DOpenVINODeveloperPackage_DIR=${{ env.BUILD_DIR }} -S ${{ env.OPENVINO_REPO }}/src/bindings/python -B ${{ github.workspace }}/py$py_version + cmake --build ${{ github.workspace }}/py$py_version --parallel + cmake --install ${{ github.workspace }}/py$py_version --config ${{ env.CMAKE_BUILD_TYPE }} --prefix ${{ env.INSTALL_WHEELS_DIR }} --component python_wheels + done + + # Setup Python 3.11 as the default one + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: ./openvino/.github/actions/setup_python + with: + version: ${{ env.PYTHON_VERSION }} + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + - name: Cmake & Build - OpenVINO Contrib run: | cmake \ @@ -199,6 +241,7 @@ jobs: cmake --build ${{ env.BUILD_DIR }} --parallel $(nproc) cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR_JS }} -P ${{ env.BUILD_DIR }}/cmake_install.cmake + # # Upload build artifacts # @@ -210,7 +253,7 @@ jobs: name: openvino_package path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz if-no-files-found: 'error' - + - name: Upload openvino wheels uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: @@ -270,6 +313,19 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} os: 'mac_13' + Python_API_Tests: + name: Python API tests + needs: [ Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + strategy: + fail-fast: false + matrix: + python-version: [ '3.9', '3.10', '3.11', '3.12' ] + with: + runner: 'macos-13' + python-version: 
${{ matrix.python-version }} + if: fromJSON(needs.smart_ci.outputs.affected_components).Python_API.test + Python_Unit_Tests: name: Python unit tests needs: [ Build, Smart_CI ] diff --git a/.github/workflows/mac_arm64.yml b/.github/workflows/mac_arm64.yml index 0708a844fe6b8b..d3fb10082adfd4 100644 --- a/.github/workflows/mac_arm64.yml +++ b/.github/workflows/mac_arm64.yml @@ -151,6 +151,7 @@ jobs: -DENABLE_CPPLINT=OFF \ -DENABLE_NCC_STYLE=OFF \ -DENABLE_TESTS=ON \ + -DENABLE_WHEEL=OFF \ -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ -DENABLE_STRICT_DEPENDENCIES=OFF \ -DCMAKE_CXX_COMPILER_LAUNCHER=${{ env.CMAKE_CXX_COMPILER_LAUNCHER }} \ @@ -168,7 +169,6 @@ jobs: run: | cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR }} -P ${{ env.BUILD_DIR }}/cmake_install.cmake cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_TEST_DIR }} -DCOMPONENT=tests -P ${{ env.BUILD_DIR }}/cmake_install.cmake - cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_WHEELS_DIR }} -DCOMPONENT=python_wheels -P ${{ env.BUILD_DIR }}/cmake_install.cmake - name: Pack Artifacts run: | @@ -180,6 +180,48 @@ jobs: tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_tests.tar.gz popd + # Setup additional Python versions for wheels building + - name: Setup Python 3.9 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.9" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Setup Python 3.10 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.10" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Setup Python 3.12 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.12" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Build additional Python wheels + run: | + for py_version in "3.9" "3.10" "3.11" "3.12" + do + python_exec_path=$(python$py_version -c "import sys; print(sys.executable)") + $python_exec_path -m pip install -r ${{ env.OPENVINO_REPO }}/src/bindings/python/wheel/requirements-dev.txt + + cmake -DPython3_EXECUTABLE=$python_exec_path -DENABLE_WHEEL=ON -DOpenVINODeveloperPackage_DIR=${{ env.BUILD_DIR }} -S ${{ env.OPENVINO_REPO }}/src/bindings/python -B ${{ github.workspace }}/py$py_version + cmake --build ${{ github.workspace }}/py$py_version --parallel + cmake --install ${{ github.workspace }}/py$py_version --config ${{ env.CMAKE_BUILD_TYPE }} --prefix ${{ env.INSTALL_WHEELS_DIR }} --component python_wheels + done + + # Setup Python 3.11 as the default one + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: ./openvino/.github/actions/setup_python + with: + version: ${{ env.PYTHON_VERSION }} + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + - name: Cmake & Build - OpenVINO Contrib run: | cmake \ @@ -279,6 +321,19 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + Python_API_Tests: + name: Python API tests + needs: [ Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + strategy: + fail-fast: false + matrix: + python-version: [ '3.9', '3.10', '3.11', '3.12' ] + with: + runner: 'macos-13-xlarge' + python-version: ${{ matrix.python-version }} + if: fromJSON(needs.smart_ci.outputs.affected_components).Python_API.test + TensorFlow_Layer_Tests: name: TensorFlow Layer Tests needs: [ Build, Smart_CI, Openvino_tokenizers ] diff --git a/.github/workflows/ubuntu_22.yml b/.github/workflows/ubuntu_22.yml index f4caec8b2458a0..4fc93d73213f78 100644 --- a/.github/workflows/ubuntu_22.yml +++ b/.github/workflows/ubuntu_22.yml @@ -300,6 
+300,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + Python_API_Tests: + name: Python API tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}' + python-version: '3.11' + if: fromJSON(needs.smart_ci.outputs.affected_components).Python_API.test + TensorFlow_Layer_Tests: name: TensorFlow Layer Tests needs: [ Docker, Build, Smart_CI, Openvino_tokenizers ] diff --git a/.github/workflows/ubuntu_24.yml b/.github/workflows/ubuntu_24.yml index d874e06a189232..1ad3951ecd3347 100644 --- a/.github/workflows/ubuntu_24.yml +++ b/.github/workflows/ubuntu_24.yml @@ -134,6 +134,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.12' + Python_API_Tests: + name: Python API tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_24_04_x64 }}", "volumes": ["/mount:/mount"]}' + python-version: '3.12' + if: fromJSON(needs.smart_ci.outputs.affected_components).Python_API.test + Pytorch_Layer_Tests: name: Pytorch Layer Tests needs: [ Docker, Build, Smart_CI ] From 67f253764c4d0a9b7ab5a8f9706d063e488d7b5b Mon Sep 17 00:00:00 2001 From: Alina Kladieva Date: Mon, 9 Dec 2024 19:27:32 +0100 Subject: [PATCH 18/23] [GHA][ov-provider] Exclude custom release packages from matching (#27979) To filter out automatically picking unwanted custom release builds like https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.5/windows_vc_mt Test run: https://github.com/openvinotoolkit/openvino_tokenizers/actions/runs/12237578864/job/34133648815?pr=338 (now the regular "windows" package is picked) Signed-off-by: Alina Kladieva --- .github/actions/openvino_provider/get_s3_package.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/actions/openvino_provider/get_s3_package.py b/.github/actions/openvino_provider/get_s3_package.py index df253a422421ec..02ea99cb2f3403 100644 --- a/.github/actions/openvino_provider/get_s3_package.py +++ b/.github/actions/openvino_provider/get_s3_package.py @@ -54,6 +54,10 @@ def main(product, version_pattern, platform, arch, folder): matching_files = filter_files_by_criteria(all_files, product, version_pattern, platform, arch, folder) if matching_files: logger.info(f"Matching packages: {sorted(matching_files)}") + if len(matching_files) > 1: + custom_release_build_pattern = fr".*/{version_pattern}/(linux_|windows_|macos_).*/.*" + # Exclude custom release builds, if any, from matches + matching_files = [file for file in matching_files if not re.search(custom_release_build_pattern, file)] package_url = f"https://storage.openvinotoolkit.org{sorted(matching_files)[-1]}" logger.info(f"Returning package URL: {package_url}") action_utils.set_github_output("package_url", package_url) From f0da7075169b97f6523d8f465cbb6ab76f995324 Mon Sep 17 00:00:00 2001 From: Alina Kladieva Date: Tue, 10 Dec 2024 08:42:26 +0100 Subject: [PATCH 19/23] [tests/requirements_pytorch] Temporarily fix optimum-intel version on last stable commit (#27985) There are failures with newer commits, e.g. 
https://github.com/openvinotoolkit/openvino/actions/runs/12240792041/job/34146426674 --------- Signed-off-by: Alina Kladieva --- .github/components.yml | 1 + tests/requirements_pytorch | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/components.yml b/.github/components.yml index 8de51a2ced3343..74247e1f051cd5 100644 --- a/.github/components.yml +++ b/.github/components.yml @@ -149,6 +149,7 @@ PyTorch_FE: build: - CPU - Python_API + - TOKENIZERS # PyTorch_FE tests depend on tokenizers build JAX_FE: revalidate: diff --git a/tests/requirements_pytorch b/tests/requirements_pytorch index be304155e2afc0..f42deb81839883 100644 --- a/tests/requirements_pytorch +++ b/tests/requirements_pytorch @@ -44,7 +44,7 @@ super-image==0.1.7 huggingface-hub==0.25.2 # use latest released version once it's available -git+https://github.com/huggingface/optimum-intel.git@main; python_version < "3.12" +git+https://github.com/huggingface/optimum-intel.git@5c735487d4bd3dd8d7dccb242d8d5988e7dd4069; python_version < "3.12" # set 'export HF_HUB_ENABLE_HF_TRANSFER=1' to benefits from hf_transfer hf_transfer==0.1.8 From 9e6dfed16a29ddfcddba78f2d1b895d647cd2ec9 Mon Sep 17 00:00:00 2001 From: Pavel Durandin Date: Tue, 10 Dec 2024 11:10:12 +0400 Subject: [PATCH 20/23] [GPU] Fix tests errors, phase 7 (#27953) ### Details: - Fixes in unit tests failures --- .../tests/unit/test_cases/convolution_gpu_test.cpp | 7 ++++--- .../intel_gpu/tests/unit/test_cases/dft_gpu_test.cpp | 2 +- .../intel_gpu/tests/unit/test_cases/pooling_gpu_test.cpp | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp index 5d01d448dcfc64..f0243f055c3670 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp @@ -10784,11 +10784,12 @@ TEST_P(conv_dyn_test, convolution_gpu_fsv16_1x1_no_bias) { auto is_weight_1x1 = (p.wei_shape[p.wei_shape.size() - 1] == 1 && p.wei_shape[p.wei_shape.size() - 2] == 1); auto is_valid_output = p.wei_shape[0] % 16 == 0; - auto is_valid_strid = p.stride[0] == 1 && p.stride[1] == 1; - auto is_valid_padding = p.pad_begin[0] == 0 && p.pad_begin[1] == 0 && p.pad_end[0] == 0 && p.pad_end[1] == 0; + auto is_valid_strid = std::all_of(p.stride.begin(), p.stride.end(), [](size_t i) { return i == 1; }); + auto is_valid_padding = std::all_of(p.pad_begin.begin(), p.pad_begin.end(), [](int i) { return i == 0; }) + && std::all_of(p.pad_end.begin(), p.pad_end.end(), [](int i) { return i == 0; }); if (!is_weight_1x1 || !is_valid_output || !is_valid_strid || !is_valid_padding) { - std::cout << "[ SKIPPED ] The test is skipped (is_weight_1x1:" << is_weight_1x1 << ", is_valid_output" << is_valid_output + std::cout << "[ SKIPPED ] The test is skipped (is_weight_1x1: " << is_weight_1x1 << ", is_valid_output: " << is_valid_output << ", is_valid_strid: " << is_valid_strid << ", is_valid_padding: " << is_valid_padding << std::endl; ASSERT_EQ(1, 1); return; diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/dft_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/dft_gpu_test.cpp index 3099c8dad5d9d3..5d78cdec028724 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/dft_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/dft_gpu_test.cpp @@ -1963,7 +1963,7 @@ const std::vector IRDFT_params_4d = { {{2, 10, 6, 2}, {2, 10, 10}, {1, 2}, {}, 
expected_rdft2d_results, rinput_data}, {{2, 10, 6, 2}, {2, 10, 10}, {1, 2}, {10, 10}, expected_rdft2d_results, rinput_data}, {{2, 5, 7, 2}, {2, 5, 12}, {1, 2}, {5, 12}, expected_rdft2d_results_2, expected_irdft2d_results_2}, - {{2, 10, 6, 2}, {2, 10, 10}, {0, 1, 2}, {10, 10}, expected_rdft3d_results, rinput_data}, + {{2, 10, 6, 2}, {2, 10, 10}, {0, 1, 2}, {10, 10, 10}, expected_rdft3d_results, rinput_data}, {{2, 10, 6, 2}, {4, 5, 12}, {0, 1, 2}, {4, 5, 12}, expected_rdft3d_results, expected_irdft3d_results_2}, }; const std::vector IRDFT_params_5d = extendByOneDimension(IRDFT_params_4d); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/pooling_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/pooling_gpu_test.cpp index 461474335e903a..324f90faf0b70e 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/pooling_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/pooling_gpu_test.cpp @@ -1244,7 +1244,7 @@ static void generic_average_wo_padding_test(format fmt, tensor output, tensor in tpl.add(reorder("reorder", input_info("in"), input_mem->get_layout().with_padding((padding) off.sizes()))); pool_in = "reorder"; } - tpl.add(pooling("pool", input_info(pool_in), pooling_mode::average_no_padding, window, stride, offset)); + tpl.add(pooling("pool", input_info(pool_in), pooling_mode::average_no_padding, window, stride, offset, offset)); auto cfg = get_test_default_config(get_test_engine()); cfg.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"pool", {format::any, "", impl_types::ocl}}})); From 6a4ba4695191b14c215e4613b5327707c0e33008 Mon Sep 17 00:00:00 2001 From: Egor Duplenskii Date: Tue, 10 Dec 2024 08:11:22 +0100 Subject: [PATCH 21/23] [CPU] Introduce FullyConnected, FCQuantized, FCCompressed, Placeholder (#26239) ### Details: 1. Introduce the following operations to the internal opset * `FullyConnected` (`MatMul` with transposed constant second input) * `FullyConnectedCompressed` (`FullyConnected` with weights compression) * `FullyConnectedQuantizedLegacy` (`FullyConnected` with quantized activations and weights and dequantize scale and zero point pulled through the Op by LPT) * `FullyConnectedQuantized` (`FullyConnected` with quantization scales and zero points on activation, weights and outputs). Planned to be used in scope of dynamic quantization. Can be used for a static quantization as well in the future. * Unused inputs are presented as `Constant` input with `Shape{0}` 2. The following transformations were added / updated: * `ConvertFullyConnectedToFullyConnectedCompressed` (replaces proprietary ~`FuseFCAndWeightsDecompression`~) * `ConvertFCToFCQuantizedLegacy` replaces proprietary ~`FuseConvMatmulFCDeconvAndDQScales`~ * `FullyConnectedBiasFusion` (added into CPU folder for now, needs to be checked and review by GPU team before adaptation to internal opset). 
Replaces proprietary ~`FuseConvolutionMatMulDeconvAndBias`~ * `ConvertMatMulToFC` updated to use `ov::op::internal:FullyConnected`, planned to be moved to internal opset after review from GPU team ### Todo - [x] Clean up debug code - [x] Clean up extra cmake targets - [x] Perf regression check ### Tickets: - 149923 --- .../include/ov_ops/fully_connected.hpp | 46 +++ .../ov_ops/fully_connected_compressed.hpp | 41 +++ .../ov_ops/fully_connected_quantized.hpp | 39 +++ .../fully_connected_quantized_legacy.hpp | 41 +++ .../convert_fc_to_compressed.hpp | 29 ++ .../convert_fc_to_quantized_legacy.hpp | 22 ++ .../src/ov_ops/fully_connected.cpp | 62 ++++ .../src/ov_ops/fully_connected_compressed.cpp | 63 ++++ .../src/ov_ops/fully_connected_quantized.cpp | 59 ++++ .../fully_connected_quantized_legacy.cpp | 71 +++++ .../convert_fc_to_compressed.cpp | 181 +++++++++++ .../convert_fc_to_quantized_legacy.cpp | 77 +++++ src/frontends/ir/src/ir_deserializer.cpp | 5 +- src/plugins/intel_cpu/src/cpu_types.cpp | 7 + src/plugins/intel_cpu/src/cpu_types.h | 6 + .../intel_cpu/src/dnnl_postops_composer.cpp | 105 +++++-- .../intel_cpu/src/dnnl_postops_composer.h | 3 +- src/plugins/intel_cpu/src/edge.cpp | 8 + src/plugins/intel_cpu/src/extension.cpp | 10 +- src/plugins/intel_cpu/src/graph_optimizer.cpp | 262 +--------------- src/plugins/intel_cpu/src/graph_optimizer.h | 1 - .../src/memory_desc/empty_memory_desc.h | 4 +- src/plugins/intel_cpu/src/node.cpp | 3 +- .../executors/acl/acl_fullyconnected.cpp | 24 +- .../dnnl/dnnl_convolution_primitive.cpp | 3 +- .../dnnl/dnnl_fullyconnected_primitive.cpp | 60 ++-- .../dnnl/dnnl_fullyconnected_primitive.hpp | 7 - .../executors/dnnl/dnnl_matmul_primitive.cpp | 10 +- .../src/nodes/executors/executor_config.hpp | 1 - .../src/nodes/executors/executor_factory.hpp | 1 - .../nodes/executors/fullyconnected_config.hpp | 7 +- .../fullyconnected_implementations.cpp | 3 +- .../src/nodes/executors/matmul_config.hpp | 1 - .../src/nodes/executors/memory_arguments.hpp | 8 +- .../src/nodes/executors/mlas/mlas_gemm.cpp | 43 +-- .../intel_cpu/src/nodes/fullyconnected.cpp | 237 +++++++++------ .../intel_cpu/src/nodes/fullyconnected.h | 34 ++- src/plugins/intel_cpu/src/nodes/input.cpp | 50 ++-- src/plugins/intel_cpu/src/nodes/input.h | 2 +- src/plugins/intel_cpu/src/nodes/reference.cpp | 2 +- .../shape_inference/custom/fullyconnected.cpp | 4 +- .../cpu_opset/common/op/fully_connected.cpp | 79 ----- .../cpu_opset/common/op/fully_connected.hpp | 39 --- .../common/pass/convert_matmul_to_fc.cpp | 32 +- .../common/pass/convert_matmul_to_fc.hpp | 2 +- .../common/pass/convert_to_power_static.cpp | 22 +- .../cpu_opset/common/pass/fc_bias_fusion.cpp | 79 +++++ .../cpu_opset/common/pass/fc_bias_fusion.hpp | 19 ++ .../pass/move_fc_reshape_to_weights.cpp | 5 +- .../cpu_opset/common/pass/split_fc.cpp | 207 ------------- .../cpu_opset/common/pass/split_fc.hpp | 81 ----- .../convert_to_cpu_specific_opset.hpp | 43 ++- .../transformation_pipeline.cpp | 3 +- .../intel_cpu/src/transformations/utils.cpp | 4 +- src/plugins/intel_cpu/src/utils/cpu_utils.hpp | 31 ++ .../src/utils/debug_capabilities.cpp | 5 +- .../intel_cpu/src/utils/debug_capabilities.h | 7 + .../instances/arm/matmul.cpp | 3 - .../src/x64/matmul_weights_decompression.cpp | 2 +- .../custom_shape_infer/fullconnect.cpp | 70 ++++- .../transformations/convert_matmul_test.cpp | 231 +++++++++------ .../move_fc_reshape_to_weights.cpp | 9 +- .../unit/transformations/split_fc_test.cpp | 280 ------------------ .../common_test_utils/src/ov_test_utils.cpp | 1 + 
64 files changed, 1562 insertions(+), 1334 deletions(-) create mode 100644 src/common/transformations/include/ov_ops/fully_connected.hpp create mode 100644 src/common/transformations/include/ov_ops/fully_connected_compressed.hpp create mode 100644 src/common/transformations/include/ov_ops/fully_connected_quantized.hpp create mode 100644 src/common/transformations/include/ov_ops/fully_connected_quantized_legacy.hpp create mode 100644 src/common/transformations/include/transformations/op_conversions/convert_fc_to_compressed.hpp create mode 100644 src/common/transformations/include/transformations/op_conversions/convert_fc_to_quantized_legacy.hpp create mode 100644 src/common/transformations/src/ov_ops/fully_connected.cpp create mode 100644 src/common/transformations/src/ov_ops/fully_connected_compressed.cpp create mode 100644 src/common/transformations/src/ov_ops/fully_connected_quantized.cpp create mode 100644 src/common/transformations/src/ov_ops/fully_connected_quantized_legacy.cpp create mode 100644 src/common/transformations/src/transformations/op_conversions/convert_fc_to_compressed.cpp create mode 100644 src/common/transformations/src/transformations/op_conversions/convert_fc_to_quantized_legacy.cpp delete mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/fully_connected.cpp delete mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/fully_connected.hpp create mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.cpp create mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.hpp delete mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/split_fc.cpp delete mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/split_fc.hpp delete mode 100644 src/plugins/intel_cpu/tests/unit/transformations/split_fc_test.cpp diff --git a/src/common/transformations/include/ov_ops/fully_connected.hpp b/src/common/transformations/include/ov_ops/fully_connected.hpp new file mode 100644 index 00000000000000..6f33b5963ffaf8 --- /dev/null +++ b/src/common/transformations/include/ov_ops/fully_connected.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/node.hpp" +#include "openvino/op/op.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace op { +namespace internal { + +class TRANSFORMATIONS_API FullyConnected : public ov::op::Op { +public: + OPENVINO_OP("FullyConnected", "ie_internal_opset"); + + FullyConnected() = default; + + FullyConnected(const ov::Output& A, + const ov::Output& B, + const ov::Output& bias, + const ov::element::Type output_type = ov::element::undefined); + + FullyConnected(const ov::Output& A, + const ov::Output& B, + const ov::element::Type output_type = ov::element::undefined); + + bool visit_attributes(ov::AttributeVisitor& visitor) override; + + ov::element::Type get_output_type() const { + return m_output_type; + } + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + void validate_and_infer_types() override; + +protected: + ov::element::Type m_output_type; +}; + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/include/ov_ops/fully_connected_compressed.hpp b/src/common/transformations/include/ov_ops/fully_connected_compressed.hpp new file mode 100644 index 00000000000000..d363a339406070 --- /dev/null 
+++ b/src/common/transformations/include/ov_ops/fully_connected_compressed.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/node.hpp" +#include "openvino/op/op.hpp" +#include "ov_ops/fully_connected.hpp" + +namespace ov { +namespace op { +namespace internal { + +class TRANSFORMATIONS_API FullyConnectedCompressed : public FullyConnected { +public: + OPENVINO_OP("FullyConnectedCompressed", "ie_internal_opset", FullyConnected); + + FullyConnectedCompressed() = default; + + FullyConnectedCompressed(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& weight_scales, + const ov::Output& weight_zero_points, + const ov::element::Type output_type = ov::element::undefined); + + FullyConnectedCompressed(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& weight_scales, + const ov::element::Type output_type = ov::element::undefined); + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + void validate_and_infer_types() override; +}; + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/include/ov_ops/fully_connected_quantized.hpp b/src/common/transformations/include/ov_ops/fully_connected_quantized.hpp new file mode 100644 index 00000000000000..6eceed0abdef78 --- /dev/null +++ b/src/common/transformations/include/ov_ops/fully_connected_quantized.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/node.hpp" +#include "openvino/op/op.hpp" +#include "ov_ops/fully_connected.hpp" + +namespace ov { +namespace op { +namespace internal { + +class TRANSFORMATIONS_API FullyConnectedQuantized : public FullyConnected { +public: + OPENVINO_OP("FullyConnectedQuantized", "ie_internal_opset", FullyConnected); + + FullyConnectedQuantized() = default; + + FullyConnectedQuantized(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& weight_scales, + const ov::Output& weight_zero_points, + const ov::Output& input_scales, + const ov::Output& input_zero_points, + const ov::Output& output_scales, + const ov::Output& output_zero_points, + const ov::element::Type output_type = ov::element::undefined); + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; +}; + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/include/ov_ops/fully_connected_quantized_legacy.hpp b/src/common/transformations/include/ov_ops/fully_connected_quantized_legacy.hpp new file mode 100644 index 00000000000000..2c68ec4dc365f9 --- /dev/null +++ b/src/common/transformations/include/ov_ops/fully_connected_quantized_legacy.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/node.hpp" +#include "openvino/op/op.hpp" +#include "ov_ops/fully_connected.hpp" + +namespace ov { +namespace op { +namespace internal { + +class TRANSFORMATIONS_API FullyConnectedQuantizedLegacy : public FullyConnected { +public: + OPENVINO_OP("FullyConnectedQuantizedLegacy", "ie_internal_opset", FullyConnected); + + FullyConnectedQuantizedLegacy() = default; + + FullyConnectedQuantizedLegacy(const ov::Output& X, + const ov::Output& W, + const 
ov::Output& bias, + const ov::Output& deq_scales, + const ov::Output& deq_zero_points, + const ov::element::Type output_type = ov::element::undefined); + + FullyConnectedQuantizedLegacy(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& deq_scales, + const ov::element::Type output_type = ov::element::undefined); + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + void validate_and_infer_types() override; +}; + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/include/transformations/op_conversions/convert_fc_to_compressed.hpp b/src/common/transformations/include/transformations/op_conversions/convert_fc_to_compressed.hpp new file mode 100644 index 00000000000000..1b6fcfb2bb3684 --- /dev/null +++ b/src/common/transformations/include/transformations/op_conversions/convert_fc_to_compressed.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/matcher_pass.hpp" +#include "ov_ops/fully_connected.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace pass { + +class TRANSFORMATIONS_API ConvertFullyConnectedToFullyConnectedCompressed; + +} // namespace pass +} // namespace ov + +class ov::pass::ConvertFullyConnectedToFullyConnectedCompressed : public ov::pass::MatcherPass { +public: + using SupportsPredicate = + std::function&, size_t, size_t, size_t)>; + + OPENVINO_RTTI("ConvertFullyConnectedToFullyConnectedCompressed", "0"); + ConvertFullyConnectedToFullyConnectedCompressed(const std::vector& supported_activation_types, + const std::vector& supported_weights_types, + SupportsPredicate supports_config = nullptr, + bool convert_u4zp_to_u8 = false); +}; diff --git a/src/common/transformations/include/transformations/op_conversions/convert_fc_to_quantized_legacy.hpp b/src/common/transformations/include/transformations/op_conversions/convert_fc_to_quantized_legacy.hpp new file mode 100644 index 00000000000000..88990f92cb573c --- /dev/null +++ b/src/common/transformations/include/transformations/op_conversions/convert_fc_to_quantized_legacy.hpp @@ -0,0 +1,22 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/matcher_pass.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace pass { + +class TRANSFORMATIONS_API ConvertFCToFCQuantizedLegacy; + +} // namespace pass +} // namespace ov + +class ov::pass::ConvertFCToFCQuantizedLegacy : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("ConvertFullyConnectedToFullyConnectedQuantized", "0"); + ConvertFCToFCQuantizedLegacy(); +}; diff --git a/src/common/transformations/src/ov_ops/fully_connected.cpp b/src/common/transformations/src/ov_ops/fully_connected.cpp new file mode 100644 index 00000000000000..3fa609362b999c --- /dev/null +++ b/src/common/transformations/src/ov_ops/fully_connected.cpp @@ -0,0 +1,62 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ov_ops/fully_connected.hpp" + +#include + +#include "matmul_shape_inference.hpp" + +namespace ov { +namespace op { +namespace internal { + +FullyConnected::FullyConnected(const ov::Output& A, + const ov::Output& B, + const ov::Output& bias, + const ov::element::Type output_type) + : Op({A, B, bias}), + m_output_type(output_type) { + validate_and_infer_types(); +} + 
+FullyConnected::FullyConnected(const ov::Output& A, + const ov::Output& B, + const ov::element::Type output_type) + : FullyConnected(A, B, std::make_shared(element::undefined, Shape{0}), output_type) {} + +bool FullyConnected::visit_attributes(ov::AttributeVisitor& visitor) { + visitor.on_attribute("output_type", m_output_type); + return true; +} + +std::shared_ptr FullyConnected::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + return std::make_shared(new_args.at(0), new_args.at(1), new_args.at(2), m_output_type); +} + +void FullyConnected::validate_and_infer_types() { + const auto input_size = get_input_size(); + NODE_VALIDATION_CHECK(this, + input_size >= 3, + "Number of inputs is incorrect. Current value is: ", + input_size, + ", expected at least 3."); + + ov::op::v0::MatMul op; + op.set_transpose_a(false); + op.set_transpose_b(true); + + auto out_shapes = + ov::op::v0::shape_infer(&op, + std::vector{get_input_partial_shape(0), get_input_partial_shape(1)}); + + auto output_type = m_output_type == ov::element::undefined ? get_input_element_type(0) : m_output_type; + set_output_type(0, output_type, out_shapes[0]); +} + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/src/ov_ops/fully_connected_compressed.cpp b/src/common/transformations/src/ov_ops/fully_connected_compressed.cpp new file mode 100644 index 00000000000000..e0bb13042ea6ff --- /dev/null +++ b/src/common/transformations/src/ov_ops/fully_connected_compressed.cpp @@ -0,0 +1,63 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ov_ops/fully_connected_compressed.hpp" + +#include + +#include "openvino/core/type/element_type.hpp" +#include "openvino/op/constant.hpp" +#include "ov_ops/fully_connected.hpp" + +namespace ov { +namespace op { +namespace internal { + +FullyConnectedCompressed::FullyConnectedCompressed(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& weight_scales, + const ov::Output& weight_zero_points, + const ov::element::Type output_type) + : FullyConnected(X, W, bias, output_type) { + set_argument(3, weight_scales); + set_argument(4, weight_zero_points); + validate_and_infer_types(); +} + +FullyConnectedCompressed::FullyConnectedCompressed(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& weight_scales, + const ov::element::Type output_type) + : FullyConnectedCompressed(X, + W, + bias, + weight_scales, + std::make_shared(element::undefined, Shape{0}), + output_type) {} + +std::shared_ptr FullyConnectedCompressed::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + return std::make_shared(new_args.at(0), + new_args.at(1), + new_args.at(2), + new_args.at(3), + new_args.at(4), + m_output_type); +} + +// @todo finalize validate_and_infer_types +void FullyConnectedCompressed::validate_and_infer_types() { + const auto input_size = get_input_size(); + + NODE_VALIDATION_CHECK(this, input_size == 5, "Number of inputs is incorrect. 
Current value is: ", input_size); + + FullyConnected::validate_and_infer_types(); +} + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/src/ov_ops/fully_connected_quantized.cpp b/src/common/transformations/src/ov_ops/fully_connected_quantized.cpp new file mode 100644 index 00000000000000..3f06e14834f7d1 --- /dev/null +++ b/src/common/transformations/src/ov_ops/fully_connected_quantized.cpp @@ -0,0 +1,59 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ov_ops/fully_connected_quantized.hpp" + +#include "openvino/core/type/element_type.hpp" +#include "ov_ops/fully_connected.hpp" + +namespace ov { +namespace op { +namespace internal { + +FullyConnectedQuantized::FullyConnectedQuantized(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& weight_scales, + const ov::Output& weight_zero_points, + const ov::Output& input_scales, + const ov::Output& input_zero_points, + const ov::Output& output_scales, + const ov::Output& output_zero_points, + const ov::element::Type output_type) + : FullyConnected(X, W, bias, output_type) { + set_argument(3, weight_scales); + set_argument(4, weight_zero_points); + set_argument(5, input_scales); + set_argument(6, input_zero_points); + set_argument(7, output_scales); + set_argument(8, output_zero_points); + validate_and_infer_types(); +} + +std::shared_ptr FullyConnectedQuantized::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + return std::make_shared(new_args.at(0), + new_args.at(1), + new_args.at(2), + new_args.at(3), + new_args.at(4), + new_args.at(5), + new_args.at(6), + new_args.at(7), + new_args.at(8), + m_output_type); +} + +// @todo finalize validate_and_infer_types +void FullyConnectedQuantized::validate_and_infer_types() { + const auto input_size = get_input_size(); + NODE_VALIDATION_CHECK(this, input_size == 9, "Number of inputs is incorrect. 
Current value is: ", input_size); + + FullyConnected::validate_and_infer_types(); +} + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/src/ov_ops/fully_connected_quantized_legacy.cpp b/src/common/transformations/src/ov_ops/fully_connected_quantized_legacy.cpp new file mode 100644 index 00000000000000..42df0980086199 --- /dev/null +++ b/src/common/transformations/src/ov_ops/fully_connected_quantized_legacy.cpp @@ -0,0 +1,71 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ov_ops/fully_connected_quantized_legacy.hpp" + +#include + +#include "matmul_shape_inference.hpp" +#include "openvino/core/type/element_type.hpp" + +namespace ov { +namespace op { +namespace internal { + +FullyConnectedQuantizedLegacy::FullyConnectedQuantizedLegacy(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& deq_scales, + const ov::Output& deq_zero_points, + const ov::element::Type output_type) + : FullyConnected(X, W, bias, output_type) { + set_argument(3, deq_scales); + set_argument(4, deq_zero_points); + validate_and_infer_types(); +} + +FullyConnectedQuantizedLegacy::FullyConnectedQuantizedLegacy(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& deq_scales, + const ov::element::Type output_type) + : FullyConnectedQuantizedLegacy(X, + W, + bias, + deq_scales, + std::make_shared(element::undefined, Shape{0}), + output_type) {} + +std::shared_ptr FullyConnectedQuantizedLegacy::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + return std::make_shared(new_args.at(0), + new_args.at(1), + new_args.at(2), + new_args.at(3), + new_args.at(4), + m_output_type); +} + +// @todo finalize validate_and_infer_types +void FullyConnectedQuantizedLegacy::validate_and_infer_types() { + const auto input_size = get_input_size(); + + NODE_VALIDATION_CHECK(this, input_size == 5, "Number of inputs is incorrect. Current value is: ", input_size); + + ov::op::v0::MatMul op; + op.set_transpose_a(false); + op.set_transpose_b(true); + + auto out_shapes = + ov::op::v0::shape_infer(&op, + std::vector{get_input_partial_shape(0), get_input_partial_shape(1)}); + + auto output_type = m_output_type == ov::element::undefined ? 
get_input_element_type(0) : m_output_type; + set_output_type(0, output_type, out_shapes[0]); +} + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/src/transformations/op_conversions/convert_fc_to_compressed.cpp b/src/common/transformations/src/transformations/op_conversions/convert_fc_to_compressed.cpp new file mode 100644 index 00000000000000..87c3b669d98c6d --- /dev/null +++ b/src/common/transformations/src/transformations/op_conversions/convert_fc_to_compressed.cpp @@ -0,0 +1,181 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/op_conversions/convert_fc_to_compressed.hpp" + +#include +#include + +#include "openvino/core/rt_info.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/subtract.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "openvino/pass/pattern/op/pattern.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "ov_ops/fully_connected.hpp" +#include "ov_ops/fully_connected_compressed.hpp" +#include "transformations/utils/utils.hpp" + +ov::pass::ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyConnectedCompressed( + const std::vector& supported_activation_types, + const std::vector& supported_weights_types, + SupportsPredicate supports_config, + bool convert_u4zp_to_u8) { + using namespace ov::pass::pattern; + + auto reshape_3d_to_2d = [](const ov::Output& output) { + auto in_ps = output.get_node()->get_input_partial_shape(0); + auto out_ps = output.get_node()->get_output_partial_shape(0); + return in_ps.rank().is_static() && out_ps.rank().is_static() && in_ps.size() == 3 && out_ps.size() == 2; + }; + + auto activation_m = any_input(ov::pass::pattern::type_matches_any(supported_activation_types)); + auto weights_m = wrap_type(ov::pass::pattern::type_matches_any(supported_weights_types)); + auto convert_m = wrap_type({weights_m}); + + auto sub_const_m = wrap_type(); + auto sub_convert_const_m = wrap_type({sub_const_m}); + auto sub_with_convert_m = wrap_type({convert_m, sub_convert_const_m}); + auto sub_no_convert_m = wrap_type({convert_m, sub_const_m}); + auto subtract_m = std::make_shared(OutputVector{sub_with_convert_m, sub_no_convert_m}); + + auto mul_const_m = wrap_type(); + auto mul_convert_const_m = wrap_type({mul_const_m}); + auto mul_scale_m = std::make_shared(OutputVector{mul_const_m, mul_convert_const_m}); + + auto mul_with_sub_m = wrap_type({subtract_m, mul_scale_m}); + auto mul_no_sub_m = wrap_type({convert_m, mul_scale_m}); + auto mul_m = std::make_shared(OutputVector{mul_with_sub_m, mul_no_sub_m}); + + auto reshape_const_m = wrap_type(); + auto reshape_m = wrap_type({mul_m, reshape_const_m}, reshape_3d_to_2d); + + auto transpose_input = std::make_shared(OutputVector{reshape_m, mul_m}); + auto transpose_const_m = wrap_type(); + auto transpose_m = wrap_type({transpose_input, transpose_const_m}); + + auto bias_m = any_input(); + auto weights_input_m = std::make_shared(ov::OutputVector{reshape_m, transpose_m, mul_m}); + auto fully_connected_m = wrap_type({activation_m, weights_input_m, bias_m}); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + 
OPENVINO_ASSERT(pattern_map.count(fully_connected_m)); + OPENVINO_ASSERT(pattern_map.count(mul_const_m)); + OPENVINO_ASSERT(pattern_map.count(weights_m)); + OPENVINO_ASSERT(pattern_map.count(bias_m)); + OPENVINO_ASSERT(pattern_map.count(convert_m)); + auto fc = std::dynamic_pointer_cast( + pattern_map.at(fully_connected_m).get_node_shared_ptr()); + if (!fc || transformation_callback(fc)) { + return false; + } + + bool has_transpose = pattern_map.count(transpose_m); + auto scale_shape = pattern_map.at(mul_const_m).get_shape(); + bool grouped = std::count_if(scale_shape.begin(), scale_shape.end(), [](size_t d) { + return d > 1; + }) > 1; + + auto weights_shape = fc->get_input_shape(1); + const size_t IC = *(weights_shape.rbegin()); + const size_t OC = *(weights_shape.rbegin() + 1); + + const size_t G = grouped ? (has_transpose ? *(scale_shape.rbegin() + 2) : *(scale_shape.rbegin() + 1)) : 1; + + if (supports_config && !supports_config(fc, IC, OC, G)) + return false; + + auto reshape_const_to_2d = [has_transpose, grouped](std::shared_ptr node) { + auto constant = std::dynamic_pointer_cast(node); + OPENVINO_ASSERT(constant != nullptr); + ov::Shape current_shape = constant->get_shape(); + if (current_shape.size() <= 2) + return constant; + + OPENVINO_ASSERT(current_shape.size() == 3); + + auto new_shape = (has_transpose || !grouped) + ? ov::Shape{current_shape[0] * current_shape[1], current_shape[2]} + : ov::Shape{current_shape[0], current_shape[1] * current_shape[2]}; + + return std::make_shared(*constant, new_shape); + }; + + auto convert_u4const_to_u8 = [convert_u4zp_to_u8](std::shared_ptr node) -> std::shared_ptr { + auto constant = std::dynamic_pointer_cast(node); + if (constant->get_element_type() != ov::element::u4 || !convert_u4zp_to_u8) + return std::dynamic_pointer_cast(constant); + return std::make_shared(node, ov::element::u8); + }; + + const ov::Output& fc_input_a = fc->input_value(0); + const auto& scale = reshape_const_to_2d(pattern_map.at(mul_const_m).get_node_shared_ptr()); + std::shared_ptr optional_zero_point = nullptr; + + const bool with_zero_point = + pattern_map.count(sub_no_convert_m) > 0 || pattern_map.count(sub_with_convert_m) > 0; + if (with_zero_point) { + // WA: Convert ZP to u8 for OneDNN case to avoid u4 reorder + optional_zero_point = + convert_u4const_to_u8(reshape_const_to_2d(pattern_map.at(sub_const_m).get_node_shared_ptr())); + } + + std::shared_ptr fc_input_b = reshape_const_to_2d(pattern_map.at(weights_m).get_node_shared_ptr()); + std::shared_ptr fc_input_scale = scale; + std::shared_ptr fc_input_zp = optional_zero_point; + std::shared_ptr fc_input_bias = pattern_map.at(bias_m).get_node_shared_ptr(); + std::vector> result_nodes = {}; + if (has_transpose) { + const auto& transpose = pattern_map.at(transpose_m).get_node_shared_ptr(); + std::shared_ptr transpose_const = pattern_map.at(transpose_const_m).get_node_shared_ptr(); + if (ov::shape_size(transpose_const->get_shape()) != fc_input_b->get_output_partial_shape(0).size()) { + std::vector new_order(fc_input_b->get_output_partial_shape(0).size()); + std::iota(new_order.begin(), new_order.end(), 0); + std::swap(new_order[new_order.size() - 1], new_order[new_order.size() - 2]); + transpose_const = + std::make_shared(ov::element::i32, ov::Shape{new_order.size()}, new_order); + } + + fc_input_b = transpose->clone_with_new_inputs({fc_input_b->output(0), transpose_const}); + ov::disable_constant_folding(fc_input_b); + result_nodes.push_back(fc_input_b); + fc_input_scale = 
transpose->clone_with_new_inputs({scale->output(0), transpose_const}); + ov::disable_constant_folding(fc_input_scale); + result_nodes.push_back(fc_input_scale); + if (with_zero_point && ov::shape_size(optional_zero_point->output(0).get_shape()) > 1) { + fc_input_zp = transpose->clone_with_new_inputs({optional_zero_point->output(0), transpose_const}); + ov::disable_constant_folding(fc_input_zp); + result_nodes.push_back(fc_input_zp); + } + } + + fc_input_zp = + with_zero_point ? fc_input_zp : std::make_shared(element::undefined, Shape{0}); + ov::disable_constant_folding(fc_input_zp); + result_nodes.push_back(fc_input_zp); + + auto new_fc = std::make_shared(fc_input_a, + fc_input_b, + fc_input_bias, + fc_input_scale, + fc_input_zp, + fc->get_output_type()); + + result_nodes.push_back(new_fc); + new_fc->set_friendly_name(fc->get_friendly_name()); + ov::copy_runtime_info(m.get_matched_nodes(), result_nodes); + ov::replace_node(fc, new_fc); + return true; + }; + + auto m = std::make_shared(fully_connected_m, + "ConvertFullyConnectedToFullyConnectedCompressed"); + this->register_matcher(m, callback); +} diff --git a/src/common/transformations/src/transformations/op_conversions/convert_fc_to_quantized_legacy.cpp b/src/common/transformations/src/transformations/op_conversions/convert_fc_to_quantized_legacy.cpp new file mode 100644 index 00000000000000..908e36a51a7eb9 --- /dev/null +++ b/src/common/transformations/src/transformations/op_conversions/convert_fc_to_quantized_legacy.cpp @@ -0,0 +1,77 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/op_conversions/convert_fc_to_quantized_legacy.hpp" + +#include + +#include "openvino/core/rt_info.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/pass/pattern/op/label.hpp" +#include "openvino/pass/pattern/op/pattern.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "ov_ops/fully_connected.hpp" +#include "ov_ops/fully_connected_quantized_legacy.hpp" +#include "transformations/utils/utils.hpp" + +ov::pass::ConvertFCToFCQuantizedLegacy::ConvertFCToFCQuantizedLegacy() { + using namespace ov::pass::pattern; + + std::vector activation_types{ov::element::u8, ov::element::i8}; + std::vector weights_types{ov::element::i8}; + + auto activations_m = pattern::any_input(ov::pass::pattern::type_matches_any(activation_types)); + auto weights_m = wrap_type(ov::pass::pattern::type_matches_any(weights_types)); + auto bias_m = pattern::any_input(); + + auto fully_connected_m = wrap_type({activations_m, weights_m, bias_m}); + auto dequantization_scales_m = wrap_type(); + auto multiply_m = wrap_type({fully_connected_m, dequantization_scales_m}); + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + auto fc_output = pattern_map.at(fully_connected_m); + auto activations = pattern_map.at(activations_m); + auto weights = pattern_map.at(weights_m); + auto bias = pattern_map.at(bias_m); + auto multiply = pattern_map.at(multiply_m); + auto dequantization_scales = pattern_map.at(dequantization_scales_m); + const auto& fc_output_shape = fc_output.get_partial_shape(); + const auto& multiply_output_shape = multiply.get_partial_shape(); + + if (*fc_output_shape.rbegin() != *multiply_output_shape.rbegin()) { + return false; + } + + auto fc_node = std::dynamic_pointer_cast( + 
pattern_map.at(fully_connected_m).get_node_shared_ptr()); + + ov::NodeVector new_ops; + auto zp = std::make_shared(element::undefined, Shape{0}); + new_ops.push_back(zp); + + auto fc_quantized = + std::make_shared(activations, + weights, + bias, + dequantization_scales, + zp, + fc_node->get_output_type()); + new_ops.push_back(fc_quantized); + + const auto& multiply_node = multiply.get_node_shared_ptr(); + fc_quantized->set_friendly_name(multiply_node->get_friendly_name()); + + ov::copy_runtime_info({multiply_node, fc_node}, new_ops); + ov::replace_node(multiply_node, fc_quantized); + + return true; + }; + + auto m = std::make_shared(multiply_m, "ConvertFullyConnectedToFullyConnectedQuantized"); + this->register_matcher(m, callback); +} diff --git a/src/frontends/ir/src/ir_deserializer.cpp b/src/frontends/ir/src/ir_deserializer.cpp index 7c8b6e9d4b97ab..2d1dfba956ea72 100644 --- a/src/frontends/ir/src/ir_deserializer.cpp +++ b/src/frontends/ir/src/ir_deserializer.cpp @@ -10,6 +10,7 @@ #include "openvino/core/except.hpp" #include "openvino/core/meta_data.hpp" #include "openvino/core/rt_info/weightless_caching_attributes.hpp" +#include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/loop.hpp" @@ -831,7 +832,9 @@ std::shared_ptr ov::XmlDeserializer::create_node(const std::vector(inputs[i].get_node_shared_ptr()) && + ov::element::Type_t::undefined == inputs[i].get_element_type()) OPENVINO_THROW(params.type, " layer ", params.name, diff --git a/src/plugins/intel_cpu/src/cpu_types.cpp b/src/plugins/intel_cpu/src/cpu_types.cpp index 3b6440e56c3272..30884bbe649962 100644 --- a/src/plugins/intel_cpu/src/cpu_types.cpp +++ b/src/plugins/intel_cpu/src/cpu_types.cpp @@ -41,6 +41,9 @@ static const TypeToNameMap& get_type_to_name_tbl() { {"GroupConvolution", Type::Convolution}, {"MatMul", Type::MatMul}, {"FullyConnected", Type::FullyConnected}, + {"FullyConnectedCompressed", Type::FullyConnected}, + {"FullyConnectedQuantizedLegacy", Type::FullyConnected}, + {"FullyConnectedQuantized", Type::FullyConnected}, {"MaxPool", Type::Pooling}, {"AvgPool", Type::Pooling}, {"AdaptiveMaxPool", Type::AdaptivePooling}, @@ -469,6 +472,10 @@ std::string algToString(const Algorithm alg) { CASE(FQCommon); CASE(FQQuantization); CASE(FQBinarization); + CASE(FullyConnectedCommon); + CASE(FullyConnectedCompressed); + CASE(FullyConnectedQuantized); + CASE(FullyConnectedQuantizedLegacy); CASE(ROIPoolingMax); CASE(ROIPoolingBilinear); CASE(ROIAlignMax); diff --git a/src/plugins/intel_cpu/src/cpu_types.h b/src/plugins/intel_cpu/src/cpu_types.h index 9461526184b0bf..71088c22af8336 100644 --- a/src/plugins/intel_cpu/src/cpu_types.h +++ b/src/plugins/intel_cpu/src/cpu_types.h @@ -213,6 +213,12 @@ enum class Algorithm { EltwiseBitwiseLeftShift, EltwiseBitwiseRightShift, + // FullyConnected algorithms + FullyConnectedCommon, + FullyConnectedCompressed, + FullyConnectedQuantized, + FullyConnectedQuantizedLegacy, + // FakeQuantize algorithms FQCommon, FQQuantization, diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp b/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp index 2f82fbe553ae19..70d28f1f4ac739 100644 --- a/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp +++ b/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp @@ -11,21 +11,69 @@ #include #include +#include "cpu_types.h" #include "memory_desc/dnnl_blocked_memory_desc.h" +#include "nodes/executors/memory_arguments.hpp" #include "openvino/core/type/element_type.hpp" +#include 
"utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" namespace ov { namespace intel_cpu { +static std::vector getDeQuantizedScales(const MemoryArgs& memory) { + if (!memory.count(ARG_DST_DEQ_SCALE)) + return {}; + + auto scalesMemory = memory.at(ARG_DST_DEQ_SCALE); + + auto scalesData = static_cast(scalesMemory->getData()); + + if (!scalesData) + return {}; + + auto dstShape = memory.at(ARG_DST)->getShape(); + auto dqScalesShape = scalesMemory->getShape(); + + auto scalesDims = getNormalizedDimsBySize(dqScalesShape.getDims(), dstShape.getDims().size()); + + auto scaleSize = std::accumulate(scalesDims.begin(), scalesDims.end(), std::size_t(1), std::multiplies()); + + std::vector DQScales(scaleSize, 1.0); + + OPENVINO_ASSERT(scaleSize == 1 || DQScales.size() == 1 || DQScales.size() == scaleSize, + "set invalid scales size , DQScales vector size: ", + DQScales.size(), + ", scale data size: ", + scaleSize); + + // @todo do we really need to broadcast dq scales and then resize them back? + if (scaleSize > DQScales.size()) + DQScales.resize(scaleSize, DQScales[0]); + if (1 == scaleSize) { + std::transform(DQScales.begin(), DQScales.end(), DQScales.begin(), [=](float val) { + return (scalesData[0] * val); + }); + } else { + for (size_t i = 0; i < DQScales.size(); i++) { + DQScales[i] *= scalesData[i]; + } + } + if (std::all_of(DQScales.begin(), DQScales.end(), [&](float val) { + return (val == DQScales[0]); + })) + DQScales.resize(1); + + return DQScales; +} + DnnlPostOpsComposer::DnnlPostOpsComposer(const PostOps& postOps, const dnnl::engine& engine, const VectorDims& outputDims, const size_t indexOfOutputChannelDim, const bool isInt8, const int weiScaleMaskPerChannel, - const std::vector& DQScales, - const bool hasBias, + const MemoryArgs& memory, const dnnl::memory::data_type outDataType) : engine(engine), postOps(postOps), @@ -39,6 +87,7 @@ DnnlPostOpsComposer::DnnlPostOpsComposer(const PostOps& postOps, dimsPerOC = dimsPerTensor = VectorDims(outputDims.size(), 1); dimsPerOC[idxOC] = OC; + const auto& DQScales = getDeQuantizedScales(memory); // generalise dq scales, so extra logic is necessary here. if (isINT8) { wei_scale_values = DQScales.empty() ? std::vector{1.0} : DQScales; @@ -49,6 +98,7 @@ DnnlPostOpsComposer::DnnlPostOpsComposer(const PostOps& postOps, updateWeiScales(); // If having the bias, attr weight scale can't be updated for further ops-ops optimization. // ONEDNN 3.x quantization for scheme: QuantizedInput * QuantizedWeight * DQScale + Bias. + const bool hasBias = !memory.at(ARG_BIAS)->getDesc().empty(); weightScaleAvailable = !hasBias; } else if (!DQScales.empty()) { // DQ scale is fused but swiching back to non-INT8 for execution in some cases. 
@@ -325,9 +375,9 @@ static OptimizedFormula updateOptimizedFormula(const FakeQuantizePostOp& postOp, } bool DnnlPostOpsComposer::appendAttrPostOps(const FakeQuantizePostOp& postOp, - bool isLastPostOp, - bool doRounding, - bool allowBinary) { + bool isLastPostOp, + bool doRounding, + bool allowBinary) { DEBUG_LOG("isLastPostOp=", isLastPostOp, ", outDataType=", @@ -541,9 +591,9 @@ bool DnnlPostOpsComposer::appendShift(const std::vector& shift, bool allo } bool DnnlPostOpsComposer::appendLinear(const std::vector& scale, - const std::vector& shift, - bool isLastPostOp, - bool allowBinary) { + const std::vector& shift, + bool isLastPostOp, + bool allowBinary) { if (scale.size() == 1 && shift.size() == 1) { if (shift[0] == 0.0f) return appendScale(scale, isLastPostOp, allowBinary); @@ -599,15 +649,27 @@ static MemoryPtr prepackDecompressionParams(const MemoryCPtr& paramsPtr, if (shape.size() == 1 && shape[0] == 1) { shape.push_back(1); } + if (shape.size() != 2 && shape.size() != 3) - OPENVINO_THROW("DnnlPostOpsComposer cannot prepack decompression params with invalid shape"); + OPENVINO_THROW("DnnlPostOpsComposer cannot prepack decompression params with invalid shape"); - Shape dstShape = needTranspose ? Shape({shape[0], shape[1]}) : Shape({shape[shape.size() - 1], shape[0]}); - DnnlBlockedMemoryDesc dstMemoryDesc(dstShape, DnnlExtensionUtils::ElementTypeToDataType(dstPrc), dnnl::memory::format_tag::io); - auto dstMem = std::make_shared(engine, dstMemoryDesc); + // weights without batch: (OC, G) + // weights with batch: (B, OC, G) + const size_t OC = shape[shape.size() - 2]; + const size_t G = shape[shape.size() - 1]; + + Shape dstShape = Shape({OC, G}); + DnnlBlockedMemoryDesc dstMemoryDesc(dstShape, + DnnlExtensionUtils::ElementTypeToDataType(dstPrc), + dnnl::memory::format_tag::io); + auto dstMem = std::make_shared(engine, dstMemoryDesc); auto srcFormat = needTranspose ? 
dnnl::memory::format_tag::oi : dnnl::memory::format_tag::io; - DnnlBlockedMemoryDesc srcMemoryDesc(dstShape, DnnlExtensionUtils::ElementTypeToDataType(paramsPtr->getDescPtr()->getPrecision()), srcFormat); + + DnnlBlockedMemoryDesc srcMemoryDesc( + dstShape, + DnnlExtensionUtils::ElementTypeToDataType(paramsPtr->getDescPtr()->getPrecision()), + srcFormat); auto srcMem = std::make_shared(engine, srcMemoryDesc, paramsPtr->getData()); dstMem->load(*srcMem); @@ -615,25 +677,32 @@ static MemoryPtr prepackDecompressionParams(const MemoryCPtr& paramsPtr, return dstMem; } -void DnnlPostOpsComposer::appendDecompressionScales(const MemoryCPtr& scales_ptr, bool needTranspose, ov::element::Type dstPrecision) { +void DnnlPostOpsComposer::appendDecompressionScales(const MemoryCPtr& scales_ptr, + bool needTranspose, + ov::element::Type dstPrecision) { if (scales_ptr == nullptr) return; auto scalesMem = prepackDecompressionParams(scales_ptr, needTranspose, dstPrecision, engine); attr.set_scales_dims(DNNL_ARG_WEIGHTS, - DnnlExtensionUtils::convertToDnnlDims(scalesMem->getStaticDims()), DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); + DnnlExtensionUtils::convertToDnnlDims(scalesMem->getStaticDims()), + DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); cpuArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = std::move(scalesMem); dnnlArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = cpuArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS]->getPrimitive(); } -void DnnlPostOpsComposer::appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, bool needTranspose, ov::element::Type dstPrecision) { +void DnnlPostOpsComposer::appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, + bool needTranspose, + ov::element::Type dstPrecision) { if (zero_points_ptr == nullptr) return; - auto zeroPointsMem = prepackDecompressionParams(zero_points_ptr, needTranspose, dstPrecision, engine); + auto zeroPointsMem = + prepackDecompressionParams(zero_points_ptr, needTranspose, dstPrecision, engine); attr.set_zero_points_dims(DNNL_ARG_WEIGHTS, - DnnlExtensionUtils::convertToDnnlDims(zeroPointsMem->getStaticDims()), DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); + DnnlExtensionUtils::convertToDnnlDims(zeroPointsMem->getStaticDims()), + DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); cpuArgs[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS] = zeroPointsMem; dnnlArgs[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS] = zeroPointsMem->getPrimitive(); } diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer.h b/src/plugins/intel_cpu/src/dnnl_postops_composer.h index c07ec0f608b6db..8c2718aaaed4d5 100644 --- a/src/plugins/intel_cpu/src/dnnl_postops_composer.h +++ b/src/plugins/intel_cpu/src/dnnl_postops_composer.h @@ -27,8 +27,7 @@ class DnnlPostOpsComposer { const size_t indexOfOutputChannelDim, const bool isINT8, const int weiScaleMaskPerChannel, - const std::vector& DQScales, - const bool hasBias, + const MemoryArgs& memory, const dnnl::memory::data_type outDataType); DnnlPrimitiveAttrs compose(); void appendDecompressionScales(const MemoryCPtr& scales_ptr, bool needTranspose, ov::element::Type dstPrecision); diff --git a/src/plugins/intel_cpu/src/edge.cpp b/src/plugins/intel_cpu/src/edge.cpp index 82bde8edae2b4a..c49b924477f694 100644 --- a/src/plugins/intel_cpu/src/edge.cpp +++ b/src/plugins/intel_cpu/src/edge.cpp @@ -5,6 +5,7 @@ #include "edge.h" #include "node.h" #include "dnnl_extension_utils.h" +#include "openvino/core/type/element_type.hpp" #include "openvino/util/pp.hpp" using namespace 
dnnl; @@ -212,6 +213,10 @@ Edge::ReorderStatus Edge::needReorder() { bool optimized = false; auto inputPortDesc = getInputPortDesc(); auto outPortDesc = getOutputPortDesc(); + + if (inputPortDesc->getMemDesc()->getPrecision() == element::undefined) + return ReorderStatus::No; + // Check whether the child node may accept the parent produced tensor if (!outPortDesc->isCompatible(*inputPortDesc)) { // Performance optimization which exploit the fact that some tensors do not need actual data reordering to be read using different descriptors @@ -410,6 +415,9 @@ const MemoryDesc& Edge::getOutputDesc() const { } const MemoryDesc& Edge::getDesc() const { + if (getInputDesc().getPrecision() == element::undefined) + return getInputDesc(); + if (!getInputDesc().isCompatible(getOutputDesc())) OPENVINO_THROW("Cannot get descriptor for edge: ", getParent()->getName(), "->", getChild()->getName()); diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index a29282d4af3101..e6dbc04b0ca6a4 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -7,6 +7,10 @@ #include "openvino/core/op_extension.hpp" #include "ov_ops/augru_cell.hpp" #include "ov_ops/augru_sequence.hpp" +#include "ov_ops/fully_connected.hpp" +#include "ov_ops/fully_connected_compressed.hpp" +#include "ov_ops/fully_connected_quantized_legacy.hpp" +#include "ov_ops/fully_connected_quantized.hpp" #include "ov_ops/gather_compressed.hpp" #include "ov_ops/multiclass_nms_ie_internal.hpp" #include "ov_ops/nms_ie_internal.hpp" @@ -16,7 +20,6 @@ #include "ov_ops/type_relaxed.hpp" #include "snippets/op/subgraph.hpp" #include "transformations/cpu_opset/common/op/causal_mask_preprocess.hpp" -#include "transformations/cpu_opset/common/op/fully_connected.hpp" #include "transformations/cpu_opset/common/op/leaky_relu.hpp" #include "transformations/cpu_opset/common/op/ngram.hpp" #include "transformations/cpu_opset/common/op/power_static.hpp" @@ -70,7 +73,6 @@ class TypeRelaxedExtension : public ov::OpExtension> { #endif #define CPU_EXTENSIONS \ - OP_EXTENSION(ov::intel_cpu::FullyConnectedNode) \ OP_EXTENSION(ov::intel_cpu::LeakyReluNode) \ OP_EXTENSION(ov::intel_cpu::PowerStaticNode) \ OP_EXTENSION(ov::intel_cpu::CausalMaskPreprocessNode) \ @@ -85,6 +87,10 @@ class TypeRelaxedExtension : public ov::OpExtension> { OP_EXTENSION(ov::op::internal::NmsStaticShapeIE) \ OP_EXTENSION(ov::op::internal::RMS) \ OP_EXTENSION(ov::op::internal::RoPE) \ + OP_EXTENSION(ov::op::internal::FullyConnected) \ + OP_EXTENSION(ov::op::internal::FullyConnectedCompressed) \ + OP_EXTENSION(ov::op::internal::FullyConnectedQuantizedLegacy) \ + OP_EXTENSION(ov::op::internal::FullyConnectedQuantized) \ OP_EXTENSION_X64(ov::intel_cpu::MHANode) \ OP_EXTENSION_X64(ov::intel_cpu::InteractionNode) \ OP_EXTENSION_X64(ov::intel_cpu::LLMMLPNode) \ diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp index 61590b8691f4b2..94f54fc4c59b55 100644 --- a/src/plugins/intel_cpu/src/graph_optimizer.cpp +++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp @@ -67,10 +67,6 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph &graph) { FuseConvMatmulFCDeconvAndDQScales(graph); graph.RemoveDroppedNodes(); - OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseFCAndWeightsDecompression"); - FuseFCAndWeightsDecompression(graph); - graph.RemoveDroppedNodes(); - OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseConvolutionAndBias"); 
FuseConvolutionMatMulDeconvAndBias(graph); graph.RemoveDroppedNodes(); @@ -217,8 +213,7 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) { auto scaleNode = node->getParentEdgeAt(1)->getParent(); if (!(parentNode->getType() == Type::Convolution || parentNode->getType() == Type::MatMul - || parentNode->getType() == Type::Deconvolution - || parentNode->getType() == Type::FullyConnected)) + || parentNode->getType() == Type::Deconvolution)) return false; if (!scaleNode->isConstant()) return false; @@ -292,257 +287,6 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) { } } -void GraphOptimizer::FuseFCAndWeightsDecompression(Graph &graph) { - std::set supportedWeightsPrecisions{ - ov::element::u8, ov::element::i8, ov::element::nf4, ov::element::u4, ov::element::i4, ov::element::f4e2m1}; - const std::set supportedDataPrecisions{ov::element::f32, ov::element::bf16}; - auto expectedNode = [](NodePtr node, Type expectedType) { - return node->getType() == expectedType && node->getChildEdges().size() == 1; - }; - -#define SKIP_FUSION_FOR_NODE(node) \ - DEBUG_LOG("FuseFCAndWeightsDecompression can't be applied for node ", node->getName()); \ - continue - - if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx2)) - return; - - auto& graphNodes = graph.GetNodes(); - for (size_t i = 0; i < graphNodes.size(); i++) { - const auto fcNode = std::dynamic_pointer_cast(graphNodes[i]); - if (fcNode == nullptr) - continue; - - auto parent = fcNode->getParentEdgeAt(1)->getParent(); - const bool withTranspose = parent->getType() == Type::Transpose; - const NodePtr transposeNode = withTranspose ? parent : nullptr; - if (transposeNode) - parent = transposeNode->getParentEdgeAt(0)->getParent(); - // Compressed weights can be shared between several FC layers - const bool is_shared_decompression = parent->getChildEdges().size() > 1; - - const bool withReshape = parent->getType() == Type::Reshape; - const auto reshapeNode = withReshape ? 
parent : nullptr; - if (reshapeNode) { - parent = reshapeNode->getParentEdgeAt(0)->getParent(); - } - - const auto multiplyNode = parent; - if (multiplyNode->getType() != Type::Eltwise || multiplyNode->getAlgorithm() != Algorithm::EltwiseMultiply || - !multiplyNode->isConstant()) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - CPU_GRAPH_OPTIMIZER_SCOPE(FuseFCAndWeightsDecompression); - const auto mulParent1 = multiplyNode->getParentEdgeAt(1)->getParent(); - NodePtr multiplyParent, multiplyConvertNode, multiplyConstNode; - multiplyParent = mulParent1; - if (multiplyParent->getType() == Type::Convert) { - multiplyConvertNode = multiplyParent; - multiplyParent = multiplyConvertNode->getParentEdgeAt(0)->getParent(); - } - multiplyConstNode = multiplyParent; - if (multiplyConstNode->getType() != Type::Input) { - SKIP_FUSION_FOR_NODE(fcNode); - } - const bool withMultiplyConvert = multiplyConvertNode != nullptr; - - const auto mulParent0 = multiplyNode->getParentEdgeAt(0)->getParent(); - const bool withSubtract = mulParent0->getAlgorithm() == Algorithm::EltwiseSubtract; - NodePtr subtractNode, subtractConvertNode, subtractConstNode; - if (withSubtract) { - subtractNode = mulParent0; - if (!expectedNode(subtractNode, Type::Eltwise)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - auto subtractParent = subtractNode->getParentEdgeAt(1)->getParent(); - if (subtractParent->getType() == Type::Convert) { - subtractConvertNode = subtractParent; - subtractParent = subtractConvertNode->getParentEdgeAt(0)->getParent(); - } - subtractConstNode = subtractParent; - if (subtractConstNode->getType() != Type::Input) { - SKIP_FUSION_FOR_NODE(fcNode); - } - } - - const bool withSubtractConvert = subtractConvertNode != nullptr; - const auto convertNode = withSubtract ? subtractNode->getParentEdgeAt(0)->getParent() : mulParent0; - if (!expectedNode(convertNode, Type::Convert)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - const auto weightsNode = convertNode->getParentEdgeAt(0)->getParent(); - if (weightsNode->getType() != Type::Input) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - // Precision limitations - if (supportedDataPrecisions.find(fcNode->getOriginalInputPrecisionAtPort(0)) == supportedDataPrecisions.end()) { - SKIP_FUSION_FOR_NODE(fcNode); - } - if (supportedWeightsPrecisions.find(weightsNode->getOriginalOutputPrecisionAtPort(0)) == supportedWeightsPrecisions.end()) { - SKIP_FUSION_FOR_NODE(fcNode); - } - if (withSubtract && - !one_of(subtractConstNode->getOriginalOutputPrecisionAtPort(0), weightsNode->getOriginalOutputPrecisionAtPort(0), ov::element::f32)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - // Shape limitations - const auto weightsShape = weightsNode->getOutputShapeAtPort(0); - if (weightsShape != multiplyNode->getOutputShapeAtPort(0)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - if (reshapeNode && (reshapeNode->getInputShapeAtPort(0).getRank() != 3 || reshapeNode->getOutputShapeAtPort(0).getRank() != 2)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - VectorDims decompressionConstShape; - const auto fcInputWeightsShape = fcNode->getInputShapeAtPort(1); - int groupNum = 1; - // Ordinary case: one decompression group - if (fcInputWeightsShape.getRank() == weightsShape.getRank()) { - const auto& out_channels = fcInputWeightsShape.getDims()[0]; - decompressionConstShape = withTranspose ? 
VectorDims{1, out_channels} : VectorDims{out_channels, 1}; - } else { - // Group decompression case: last 3 dimension (there could be also prepending '1's in the beginning) of weights shape must be: - // [N, G, O], if transpose = true - // [O, N, G], otherwise. - // O - output channels - // N - number of groups - // G - group size - const auto& weights_dims = weightsShape.getStaticDims(); - const auto& N = withTranspose ? *(weights_dims.rbegin() + 2) : *(weights_dims.rbegin() + 1); - const auto& O = withTranspose ? *weights_dims.rbegin() : *(weights_dims.rbegin() + 2); - // Group decompression is applied by O and N dims - decompressionConstShape = withTranspose ? VectorDims{N, 1, O} : VectorDims{O, N, 1}; - groupNum = N; - } - - auto check_decompression_shape = [&decompressionConstShape](const VectorDims& shape_to_check) { - if (shape_to_check.size() > decompressionConstShape.size()) - return false; - if (std::all_of(shape_to_check.begin(), shape_to_check.end(), [](Dim x) { return x == 1; })) - return true; - const auto comparison_start_pos = decompressionConstShape.size() - shape_to_check.size(); - // in case of different ranks shapes are compared taking into account ranks numpy broadcasting - return std::equal(shape_to_check.begin(), shape_to_check.end(), decompressionConstShape.begin() + comparison_start_pos); - }; - if (!check_decompression_shape(multiplyConstNode->getOutputShapeAtPort(0).getDims())) { - SKIP_FUSION_FOR_NODE(fcNode); - } - if (withSubtract && !check_decompression_shape(subtractConstNode->getOutputShapeAtPort(0).getDims())) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - const size_t OC = fcInputWeightsShape.getDims()[0]; - const size_t IC = fcInputWeightsShape.getDims()[1]; - // HW specific shape limitations - if (impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core_amx) && - fcNode->getOriginalInputPrecisionAtPort(0) == ov::element::bf16) { - // OneDNN AMX IP implementation has limited shapes support due to performance considerations. As a current solution conditions below are copied - // from OneDNN to make sure correct IP impl will be used since fallback one doesn't support weights decompression feature. 
- size_t simdWidth = 16; - size_t vnniFactor = 2; - size_t maxSize = 512; - auto amxRow = vnniFactor * simdWidth; - - if ((IC <= amxRow && OC <= amxRow) || (IC <= maxSize && OC <= maxSize && IC % amxRow != 0)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - } - - // OneDNN IP primitive provides limited decompression params support - if (IC % groupNum != 0 || IC / groupNum < 4 || OC == 1) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - // Fusion processing - auto *multiplyInputNode = dynamic_cast(multiplyConstNode.get()); - OPENVINO_ASSERT(multiplyInputNode, "Cannot cast ", multiplyConstNode->getName(), " to Input node."); - fcNode->fuseDecompressionMultiply(multiplyInputNode->getMemoryPtr()); - - if (withSubtract) { - auto *subtractInputNode = dynamic_cast(subtractConstNode.get()); - OPENVINO_ASSERT(multiplyInputNode, "Cannot cast ", subtractConstNode->getName(), " to Input node."); - fcNode->fuseDecompressionSubtract(subtractInputNode->getMemoryPtr()); - } - - fcNode->addOriginalLayer(multiplyNode->getOriginalLayers()); - fcNode->addOriginalLayer(convertNode->getOriginalLayers()); - if (withSubtract) - fcNode->addOriginalLayer(subtractNode->getOriginalLayers()); - if (withSubtractConvert) - fcNode->addOriginalLayer(subtractConvertNode->getOriginalLayers()); - if (withMultiplyConvert) - fcNode->addOriginalLayer(multiplyConvertNode->getOriginalLayers()); - - const auto& weightsPrecision = weightsNode->getOriginalOutputPrecisionAtPort(0); - if (withTranspose) { - transposeNode->setOriginalInputPrecisionAtPort(0, weightsPrecision); - transposeNode->setOriginalOutputPrecisionAtPort(0, weightsPrecision); - } - if (withReshape) { - reshapeNode->setOriginalInputPrecisionAtPort(0, weightsPrecision); - reshapeNode->setOriginalOutputPrecisionAtPort(0, weightsPrecision); - } - fcNode->setOriginalInputPrecisionAtPort(1, weightsPrecision); - - // If decompression subgraph is shared with other nodes, it mustn't be removed. - // In this case, the current FC is reconnected to the weights - if (is_shared_decompression) { - const auto weights_out_edge = weightsNode->getChildEdges()[0].lock(); - const auto fc_weights_path_edge = withTranspose ? transposeNode->getParentEdgeAt(0) - : fcNode->getParentEdgeAt(1); - const auto inNum = weights_out_edge->getInputNum(); - const auto outNum = fc_weights_path_edge->getOutputNum(); - graph.RemoveEdge(fc_weights_path_edge); - // In case of shared group decompression, Reshape node has to be copied for the current FC - if (withReshape) { - const auto& reshapeOutShape = reshapeNode->getOutputShapeAtPort(0).getStaticDims(); - auto reshapeConst = std::make_shared(ov::element::i32, - ov::Shape{reshapeOutShape.size()}, - reshapeOutShape); - auto reshapeDummyInput = std::make_shared(reshapeNode->getOriginalInputPrecisionAtPort(0), - reshapeNode->getInputShapeAtPort(0).toPartialShape()); - const auto reshape = std::make_shared(reshapeDummyInput, reshapeConst, false); - reshape->set_friendly_name(reshapeNode->getName() + "_copy"); - const auto cpuReshape = std::make_shared(reshape, graph.getGraphContext()); - graph.InsertNode(weightsNode, withTranspose ? transposeNode : fcNode, cpuReshape, inNum, outNum, false); - const auto cpuReshapeConst = std::make_shared(reshapeConst, graph.getGraphContext()); - graph.AddNode(cpuReshapeConst); - graph.CreateEdge(cpuReshapeConst, cpuReshape, 0, 1); - } else { - graph.CreateEdge(weightsNode, withTranspose ? 
transposeNode : fcNode, inNum, outNum); - } - } else { - // If decompression subgraph is not shared with other nodes, it can be removed - if (withSubtract) - graph.RemoveEdge(subtractNode->getParentEdgeAt(1)); - if (withSubtractConvert) { - // SubtractConvert is removed only if there are no other consumers (e.g. CompressedGather) - const auto& restChilds = subtractConvertNode->getChildEdges(); - if (restChilds.empty()) - graph.RemoveEdge(subtractConvertNode->getParentEdgeAt(0)); - } - graph.RemoveEdge(multiplyNode->getParentEdgeAt(1)); - if (withMultiplyConvert) { - // MultiplyConvert is removed only if there are no other consumers (e.g. CompressedGather) - const auto& restChilds = multiplyConvertNode->getChildEdges(); - if (restChilds.empty()) - graph.RemoveEdge(multiplyConvertNode->getParentEdgeAt(0)); - } - - graph.DropNode(convertNode); - if (withSubtract) - graph.DropNode(subtractNode); - graph.DropNode(multiplyNode); - } - DEBUG_LOG("FuseFCAndWeightsDecompression finished for node ", fcNode->getName()); - } -#undef SKIP_FUSION_FOR_NODE -} - void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) { auto& graphNodes = graph.GetNodes(); @@ -556,7 +300,7 @@ void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) { return false; if (!deconv) - return (one_of(node->getType(), Type::Convolution, Type::MatMul, Type::FullyConnected) && + return (one_of(node->getType(), Type::Convolution, Type::MatMul) && node->getParentEdges().size() == 2); else return deconv->canFuseBias(); @@ -984,9 +728,7 @@ void GraphOptimizer::FuseFCAndTransposeOnWeights(Graph& graph) { auto isSuitablePattern = [](NodePtr parent) { bool res = true && parent->getType() == Type::Transpose && parent->getChildEdges().size() == 1 - && parent->getChildEdgeAt(0)->getOutputNum() == 1 && parent->getChildEdgeAt(0)->getChild()->getType() == Type::FullyConnected - && parent->getOutputShapeAtPort(0).getRank() == 2 && parent->isConstant(); return res; }; diff --git a/src/plugins/intel_cpu/src/graph_optimizer.h b/src/plugins/intel_cpu/src/graph_optimizer.h index 886296a7c0053b..536ef468a09816 100644 --- a/src/plugins/intel_cpu/src/graph_optimizer.h +++ b/src/plugins/intel_cpu/src/graph_optimizer.h @@ -20,7 +20,6 @@ class GraphOptimizer { private: void FuseConvMatmulFCDeconvAndDQScales(Graph &graph); - void FuseFCAndWeightsDecompression(Graph &graph); void FuseConvolutionMatMulDeconvAndBias(Graph &graph); void FuseDeconvolutionAndSimpleOperation(Graph &graph); void FuseMultiplyAndAdd(Graph &graph); diff --git a/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h index 4b641669262591..1575841cb2be9e 100644 --- a/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h +++ b/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h @@ -59,7 +59,9 @@ class EmptyMemoryDesc : public MemoryDesc { } MemoryDescPtr cloneWithNewPrecision(const ov::element::Type prec) const override { - OPENVINO_THROW("Clone an empty memory desc with any precision (", prec, ") is prohibited"); + OPENVINO_ASSERT(prec == ov::element::undefined, + "Clone an empty memory desc with defined precision: ", prec, " is prohibited"); + return clone(); } private: diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index de5c53429138c4..ee0a99c3bba44e 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -6,6 +6,7 @@ #include "cpu_types.h" #include "edge.h" #include "partitioned_mem_blk.h" +#include 
"openvino/core/type/element_type.hpp" #include #include @@ -1673,7 +1674,7 @@ bool Node::isInputTensorAtPortEmpty(size_t port) const { auto edge = getParentEdgeAt(port); if (one_of(edge->getStatus(), Edge::Status::Allocated, Edge::Status::Validated)) { auto&& mem = edge->getMemory(); - if (mem.isDefined()) { + if (mem.isDefined() && !mem.getDesc().empty()) { return mem.getShape().hasZeroDims(); } } diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp index cc42691950a3ff..9660178e1af4a4 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp @@ -11,6 +11,7 @@ #include "nodes/executors/executor.hpp" #include "nodes/executors/memory_arguments.hpp" #include "utils/debug_capabilities.h" +#include "utils/cpu_utils.hpp" #include "nodes/executors/debug_messages.hpp" #include "nodes/executors/implementation_utils.hpp" #include "nodes/convert.h" @@ -201,9 +202,22 @@ static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, MemoryArgs memoryArgs; memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS); memoryArgs[ARG_WEI] = memory.at(ARG_WEI); + + auto originalWeightsDesc = memory.at(ARG_WEI)->getDescPtr(); + + // normalize weights to 2D + const auto& wgtDims = originalWeightsDesc->getShape().getStaticDims(); + const VectorDims wgtDims2D = reshapeDownToRank<2>(wgtDims); + + originalWeightsDesc = std::make_shared(originalWeightsDesc->getPrecision(), Shape{wgtDims2D}); + + auto dnnlSrcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(originalWeightsDesc); + auto dstDesc = originalWeightsDesc->cloneWithNewPrecision(aclfcAttrs.inputPrecision); + auto dnnlDstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(dstDesc); + if (memory.at(ARG_SRC_0)->getShape().isDynamic()) { const auto& inShape = memory.at(ARG_SRC_0)->getShape(); - const auto& wShape = memory.at(ARG_WEI)->getShape(); + const auto& wShape = originalWeightsDesc->getShape(); const auto& inDymmyDims = makeDummyInputDims(inShape, wShape); const auto& outDymmyDims = makeDummyOutputDims(inDymmyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank()); memoryArgs[ARG_SRC_0] = std::make_shared(context->getEngine(), @@ -214,6 +228,7 @@ static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0); memoryArgs[ARG_DST] = memory.at(ARG_DST); } + // TODO: ACLWeightFormatGenerator should be replaced with Reorder executor // that calls ACL NEReorder + NETranspose or dnnl::reorder depending on backend availability auto aclWeightsRepack = std::make_shared(attrs, postOps, memoryArgs); @@ -221,13 +236,6 @@ static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, expectedWeightFormat = isNeededReorder ? 
aclWeightsRepack->getOptImplWeightFormat() : arm_compute::WeightFormat::UNSPECIFIED; weiTensorInfo = aclWeightsRepack->getTensorInfo(ACLArgs::ACL_WEI); - MemoryPtr dstMemPtr = std::make_shared(context->getEngine(), - memory.at(ARG_WEI)->getDescPtr()->cloneWithNewPrecision(aclfcAttrs.inputPrecision)); - auto dstDesc = dstMemPtr->getDescPtr(); - auto dnnlDstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(dstDesc); - auto weiDesc = memory.at(ARG_WEI)->getDescPtr(); - auto dnnlSrcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(weiDesc); - if (isNeededReorder) { dnnl::impl::dim_t o_dim = 0; dnnl::impl::dim_t inner_dim = 1; diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp index 8f9d7ad0805e41..61aca683a37687 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp @@ -157,8 +157,7 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const ConvAttrs& attrs, one_of(srcDesc->getPrecision(), ov::element::u8, ov::element::i8) && weiDesc->getPrecision() == ov::element::i8; auto outputDataType = DnnlExtensionUtils::ElementTypeToDataType(dstDesc->getPrecision()); - DnnlPostOpsComposer - dnnlpoc(postOps, context->getEngine(), dims, 1, isINT8, 1 << 0, {}, attrs.withBias, outputDataType); + DnnlPostOpsComposer dnnlpoc(postOps, context->getEngine(), dims, 1, isINT8, 1 << 0, memory, outputDataType); return dnnlpoc.compose(); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp index fcb70d4753b2ce..780dbb6f2f3f11 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include "nodes/executors/executor.hpp" #include "nodes/executors/fullyconnected_config.hpp" #include "nodes/executors/memory_arguments.hpp" +#include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" namespace ov { @@ -115,9 +117,10 @@ DnnlMemoryDescPtr DnnlFCPrimitive::makeTransposedWeightDescriptor(const DnnlMemo return srcDesc; const auto& weiDesc = srcDesc->getDnnlDesc(); - const auto reorderedWeiDesc = - dnnl::memory::desc{weiDesc.get_dims(), weiDesc.get_data_type(), dnnl::memory::format_tag::ba}; - const auto transposedWeiDesc = reorderedWeiDesc.reshape(dstDesc->getDnnlDesc().get_dims()); + auto wDims = weiDesc.get_dims(); + dnnl::memory::dims wDims2D = reshapeDownToRank<2>(wDims); + + const auto transposedWeiDesc = dnnl::memory::desc{wDims2D, weiDesc.get_data_type(), dnnl::memory::format_tag::ba}; return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc); } @@ -140,12 +143,11 @@ bool DnnlFCPrimitive::useWeightsDecompressionImpl(const ov::element::Type inputT return false; } -bool DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, - const MemoryDescPtr srcDesc, - const MemoryDescPtr weightsDesc, - MemoryCPtr scalesPtr, - MemoryCPtr zpPtr, - bool needTranspose) { +static bool useDynamicQuantizationImpl(size_t dqGroupSize, + const MemoryDescPtr srcDesc, + const MemoryDescPtr weightsDesc, + const MemoryArgs& memory, + bool needTranspose) { if (dqGroupSize == 0) return false; @@ -155,6 +157,8 @@ bool 
DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, if (srcDesc->getPrecision() != ov::element::f32) return false; + + MemoryCPtr zpPtr = memory.count(ARG_WEI | ARG_ATTR_ZERO_POINTS) ? memory.at(ARG_WEI | ARG_ATTR_ZERO_POINTS) : nullptr; // For dynamic quantization, VNNI accumulation requires weight to be unsigned. // To support dynamic quantization with weights symmetrically quantized as i8/i4 // w/o zero-point, we will transform weight to u8/u4 weight with zp 128/8. @@ -177,11 +181,15 @@ bool DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, if (weightsDesc->getPrecision() == ov::element::u4) { int ic = weightsDesc->getShape().getStaticDims()[1]; int minGroupSize = INT_MAX; + + MemoryCPtr scalesPtr = memory.count(ARG_WEI | ARG_ATTR_SCALES) ? memory.at(ARG_WEI | ARG_ATTR_SCALES) : nullptr; + if (scalesPtr && scalesPtr->getShape().getRank() == 3) { auto scalesDims = scalesPtr->getShape().getStaticDims(); auto groupsNum = needTranspose ? scalesDims[1] : scalesDims[0]; minGroupSize = ic / groupsNum; } + if (zpPtr && zpPtr->getShape().getRank() == 3) { auto zpDims = zpPtr->getShape().getStaticDims(); int groupsNum = needTranspose ? zpDims[1] : zpDims[0]; @@ -196,11 +204,6 @@ bool DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, return true; } -template -static std::vector normalizeDimsTo2D(const std::vector& dims) { - return {std::accumulate(dims.begin(), dims.end() - 1, (T)1, std::multiplies()), dims[dims.size() - 1]}; -} - static DnnlPrimitiveAttrs createPrimitiveAttrs(const FCAttrs& attrs, const PostOps& postOps, const MemoryArgs& memory, @@ -211,7 +214,7 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const FCAttrs& attrs, const auto& dstDesc = memory.at(ARG_DST)->getDescPtr(); const auto& originalDims = dstDesc->getShape().getMinDims(); - const auto& dims = normalizeDimsTo2D(originalDims); + const auto& dims = reshapeDownToRank<2>(originalDims); auto isINT8 = one_of(srcDesc->getPrecision(), ov::element::u8, ov::element::i8) && weiDesc->getPrecision() == ov::element::i8; @@ -223,21 +226,22 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const FCAttrs& attrs, dims.size() - 1, isINT8, 1 << 0, - attrs.dequantizationScales, - !memory.at(ARG_BIAS)->getDesc().empty(), + memory, outputDataType); - if (attrs.decompressionMultiplyPtr) { - auto dstPrc = attrs.decompressionMultiplyPtr->getPrecision(); + if (memory.count(ARG_WEI | ARG_ATTR_SCALES)) { + auto dstPrc = memory.at(ARG_WEI | ARG_ATTR_SCALES)->getPrecision(); if (dstPrc != f8e8m0 || useDynamicQuantization) dstPrc = ov::element::f32; - dnnlpoc.appendDecompressionScales(attrs.decompressionMultiplyPtr, !attrs.weightsNonTransposed, dstPrc); + dnnlpoc.appendDecompressionScales(memory.at(ARG_WEI | ARG_ATTR_SCALES), !attrs.weightsNonTransposed, dstPrc); } - if (attrs.decompressionSubtractPtr) { + + if (memory.count(ARG_WEI | ARG_ATTR_ZERO_POINTS)) { auto dstPrc = useDynamicQuantization ? 
ov::element::u8 : ov::element::f32; - dnnlpoc.appendDecompressionZeroPoints(attrs.decompressionSubtractPtr, !attrs.weightsNonTransposed, dstPrc); + dnnlpoc.appendDecompressionZeroPoints(memory.at(ARG_WEI | ARG_ATTR_ZERO_POINTS), !attrs.weightsNonTransposed, dstPrc); } + if (useDynamicQuantization) { auto wei_precision = weiDesc->getPrecision(); bool is_symmetric_weights = (wei_precision == ov::element::i8) || (wei_precision == ov::element::i4); @@ -261,7 +265,7 @@ static dnnl::memory::desc normalizeDescriptor(const dnnl::memory::desc& desc) { const auto& dims = desc.get_dims(); if (dims.size() > 2) - return desc.reshape(normalizeDimsTo2D(dims)); + return desc.reshape(reshapeDownToRank<2>(dims)); return desc; } @@ -276,12 +280,13 @@ static dnnl::inner_product_forward::primitive_desc createDescriptorInternal(cons const bool useWeightsDecompression) { const auto normalizedInputDesc = normalizeDescriptor(inputDesc); const auto normalizedOutputDesc = normalizeDescriptor(outputDesc); + const auto normalizedWeightDesc = normalizeDescriptor(weightDesc); const auto indt = normalizedInputDesc.get_data_type(); auto wdt = indt; if (useWeightsDecompression) { - wdt = weightDesc.get_data_type(); + wdt = normalizedWeightDesc.get_data_type(); // dynamic quantization with symmetric quantized weights needs unsigned weights uint64_t dynQuantGroupSize = 0; @@ -297,8 +302,8 @@ static dnnl::inner_product_forward::primitive_desc createDescriptorInternal(cons } const dnnl::memory::desc weightsDesc = - useSparseWeights ? dnnl::memory::desc().sparse_desc(weightDesc.get_dims(), wdt) - : dnnl::memory::desc(weightDesc.get_dims(), wdt, memory::format_tag::any); + useSparseWeights ? dnnl::memory::desc().sparse_desc(normalizedWeightDesc.get_dims(), wdt) + : dnnl::memory::desc(normalizedWeightDesc.get_dims(), wdt, memory::format_tag::any); return dnnl::inner_product_forward::primitive_desc(engine, dnnl::prop_kind::forward_inference, @@ -387,8 +392,7 @@ DnnlShapeAgnosticDataPtr DnnlFCPrimitive::createShapeAgnosticData(const FCAttrs& useWeightsDecompression && useDynamicQuantizationImpl(attrs.dynamicQuantizationGroupSize, srcDesc, weiDesc, - attrs.decompressionMultiplyPtr, - attrs.decompressionSubtractPtr, + memory, !attrs.weightsNonTransposed); const auto postOpData = createPrimitiveAttrs(attrs, postOps, memory, context, useDynamicQuantization); diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp index 5295b9655066cc..21247f149ca69f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp @@ -75,13 +75,6 @@ class DnnlFCPrimitive { const DnnlShapeAgnosticDataPtr& shapeAgnosticData); private: - static bool useDynamicQuantizationImpl(size_t dqGroupSize, - const MemoryDescPtr srcDesc, - const MemoryDescPtr weightsDesc, - MemoryCPtr scalesPtr, - MemoryCPtr zpPtr, - bool needTranspose); - dnnl::stream m_stream; dnnl::primitive_desc m_primDesc; impl_desc_type m_implType; diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp index 1b8646c858e532..40c365ee5f4da5 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp @@ -27,6 +27,7 @@ #include 
"nodes/executors/fullyconnected_config.hpp" #include "nodes/executors/matmul_config.hpp" #include "nodes/executors/memory_arguments.hpp" +#include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" namespace ov { @@ -104,10 +105,10 @@ DnnlMemoryDescPtr DnnlMatMulPrimitive::makeTransposedWeightDescriptor(const Dnnl const auto& weiDesc = srcDesc->getDnnlDesc(); auto wDims = weiDesc.get_dims(); auto wDataType = weiDesc.get_data_type(); - std::swap(wDims[wDims.size() - 1], wDims[wDims.size() - 2]); + dnnl::memory::dims wDims2D = reshapeDownToRank<2>(wDims); const auto format = weightsNonTransposed ? dnnl::memory::format_tag::ab : dnnl::memory::format_tag::ba; - const auto transposedWeiDesc = dnnl::memory::desc{wDims, wDataType, format}; + const auto transposedWeiDesc = dnnl::memory::desc{wDims2D, wDataType, format}; return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc); } @@ -134,8 +135,7 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const MatMulAttrs& attrs, dims.size() - 1, isINT8, 1 << 0, - attrs.dequantizationScales, - !memory.at(ARG_BIAS)->getDesc().empty(), + memory, outputDataType); return dnnlpoc.compose(); @@ -262,7 +262,7 @@ DnnlShapeAgnosticDataPtr DnnlMatMulPrimitive::createShapeAgnosticData(const FCAt const auto& weiDesc = memory.at(ARG_WEI)->getDescPtr(); const auto& biasDesc = memory.at(ARG_BIAS)->getDescPtr(); auto dstDesc = memory.at(ARG_DST)->getDescPtr(); - MatMulAttrs mmAttrs{false, false, attrs.dequantizationScales}; + MatMulAttrs mmAttrs{false, false}; const auto postOpData = createPrimitiveAttrs(mmAttrs, postOps, memory, context, false); diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor_config.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor_config.hpp index 09b3b33cfe6b2f..d08c4ad8127325 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor_config.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor_config.hpp @@ -6,7 +6,6 @@ #include "post_ops.hpp" #include "memory_arguments.hpp" -#include "printers.hpp" namespace ov { namespace intel_cpu { diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp index f12795d5d1eb16..dd05cc58d43c32 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp @@ -19,7 +19,6 @@ namespace ov { namespace intel_cpu { -using namespace executor; template class ExecutorFactory { diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp index ad6479597c6971..1699a845a3314b 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp @@ -19,13 +19,8 @@ struct FCAttrs { bool withBias = false; bool weightsNonTransposed = false; bool sparseWeights = false; - // @todo only memory descriptors should be a part of attributes - // actual memory should be passed into "execute" or "prepareMemory" calls - std::vector dequantizationScales; - // @todo should be passed as an additional memory input? 
- MemoryCPtr decompressionSubtractPtr; - MemoryCPtr decompressionMultiplyPtr; uint64_t dynamicQuantizationGroupSize; + ov::intel_cpu::Config::ModelType modelType = ov::intel_cpu::Config::ModelType::Unknown; }; diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp index 4cf6992985ecd3..10f472ddcd7283 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp @@ -441,8 +441,7 @@ const std::vector>& getImplementations() { const ExecutorContext::CPtr context, std::shared_ptr shareAgnosticData) const { MatMulAttrs matMulAttrs{false, - false, - attrs.dequantizationScales}; + false}; auto primitive = DefaultInstantiator{}( memory, diff --git a/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp b/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp index 9e484b24a2940e..e42bf3138bce91 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp @@ -12,7 +12,6 @@ namespace intel_cpu { struct MatMulAttrs { bool transposeA; bool transposeB; - std::vector dequantizationScales; }; using MatMulConfig = executor::Config; diff --git a/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp b/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp index c04ca39e845ee1..7150226d27c601 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp @@ -14,7 +14,7 @@ namespace intel_cpu { using MemoryDescArgs = std::unordered_map; using MemoryArgs = std::unordered_map; -// @todo add more options +// basic inputs #define ARG_SRC_0 1 #define ARG_SRC ARG_SRC_0 #define ARG_SRC_1 2 @@ -24,6 +24,12 @@ using MemoryArgs = std::unordered_map; #define ARG_WEI_0 33 #define ARG_WEI ARG_WEI_0 #define ARG_BIAS 41 +// legacy dequantization scale +#define ARG_DST_DEQ_SCALE 53 +// scaling factors provided at execution time +#define ARG_ATTR_SCALES 4096 +// zero points provided at execution time +#define ARG_ATTR_ZERO_POINTS 8192 } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp index a03bfe2649413a..8fd945b773f262 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp @@ -23,6 +23,10 @@ using namespace executor; using namespace dnnl; using namespace ov::element; +static Dim batchDim(const VectorDims& dims) { + return std::accumulate(dims.begin(), dims.end() - 1, 1, std::multiplies()); +} + static MemoryPtr prepareWeightMemory(const MemoryPtr weightsMemory, const ExecutorContext::CPtr context, const bool weightsTransposed) { @@ -31,14 +35,15 @@ static MemoryPtr prepareWeightMemory(const MemoryPtr weightsMemory, // Weights are transposed by MatMulConstTransposesExtraction // K is the IC of weight // the weight is reshaped to [-1, K] in ConvertMatMulToFC - const auto K = wgtDims[1]; - const auto N = wgtDims[0]; + Dim K = wgtDims.back(); + Dim N = batchDim(wgtDims); auto packedBsize = mlas_sgemm_pack_get_size(N, K); auto create = [&]() { float* weightPtr = weightsMemory->getDataAs(); size_t ldb = weightsTransposed ? 
K : N; + MemoryPtr _ptr = std::make_shared(context->getEngine(), intel_cpu::CpuBlockedMemoryDesc(i8, intel_cpu::Shape{packedBsize})); float* prepackedDst = _ptr->getDataAs(); @@ -66,21 +71,10 @@ bool MlasGemmExecutor::supports(const FCConfig& config) { DEBUG_LOG("MlasGemmExecutor: PostOps are not supported"); return false; } - const auto& weiDesc = config.descs.at(ARG_WEI); - const auto& dstDesc = config.descs.at(ARG_DST); - // MLAS cannot support weight dims > 2, e.g. [1,64,9,9] * [10,64,9,9] - const auto& weightsDims = weiDesc->getShape().getStaticDims(); - if (weightsDims.size() > 2) { - if (!std::all_of(weightsDims.begin() + 2, weightsDims.end(), [](const Dim dim) { - return dim == 1; - })) { - DEBUG_LOG("MlasGemmExecutor: weights dims > 2 are not supported"); - return false; - } - } + const auto& dstDesc = config.descs.at(ARG_DST); - if (config.attrs.withBias) { + if (!config.descs.at(ARG_BIAS)->empty()) { const auto& biaDesc = config.descs.at(ARG_BIAS); const auto& biasDims = biaDesc->getShape().getStaticDims(); const auto& outDims = dstDesc->getShape().getDims(); @@ -108,24 +102,17 @@ MlasGemmExecutor::MlasGemmExecutor(const FCAttrs& attrs, const ExecutorContext::CPtr context) : m_attrs(attrs), m_memoryArgs(memory), - packedWeights(prepareWeightMemory(memory.at(ARG_WEI), context, !attrs.weightsNonTransposed)) {} + packedWeights(prepareWeightMemory(memory.at(ARG_WEI), context, !attrs.weightsNonTransposed)), + N(batchDim(memory.at(ARG_WEI)->getStaticDims())), + K(memory.at(ARG_WEI)->getStaticDims().back()) +{} bool MlasGemmExecutor::update(const MemoryArgs& memory) { - const auto& weiDesc = memory.at(ARG_WEI)->getDescPtr(); const auto& dstDesc = memory.at(ARG_DST)->getDescPtr(); - const auto& wgtDims = weiDesc->getShape().getStaticDims(); - // Weights are transposed by MatMulConstTransposesExtraction - // K is the IC of weight - // the weight is reshaped to [-1, K] in ConvertMatMulToFC - K = wgtDims[1]; - N = wgtDims[0]; const auto& outDims = dstDesc->getShape().getStaticDims(); - if (outDims.size() > 2) { - M = std::accumulate(outDims.begin(), outDims.end() - 1, 1, std::multiplies()); - } else { - M = outDims[0]; - } + M = outDims.size() > 2 ? 
batchDim(outDims) : outDims[0]; + return true; } diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp index 31ae4f26cc08a1..0f5c46e8bcd7cd 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp @@ -10,6 +10,7 @@ #include "common/cpu_convert.h" #include "common/cpu_memcpy.h" +#include "cpu_types.h" #include "dnnl_extension_utils.h" #include "executors/memory_arguments.hpp" #include "graph_context.h" @@ -19,11 +20,16 @@ #include "memory_desc/cpu_memory_desc_utils.h" #include "nodes/executors/executor.hpp" #include "nodes/executors/fullyconnected_config.hpp" +#include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/runtime/threading/cpu_message.hpp" +#include "ov_ops/fully_connected.hpp" +#include "ov_ops/fully_connected_quantized.hpp" +#include "ov_ops/fully_connected_quantized_legacy.hpp" +#include "ov_ops/fully_connected_compressed.hpp" #include "post_ops.hpp" #include "shape_inference/custom/fullyconnected.hpp" -#include "transformations/cpu_opset/common/op/fully_connected.hpp" +#include "transformations/utils/utils.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" @@ -39,25 +45,76 @@ namespace node { bool FullyConnected::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - const auto fc = std::dynamic_pointer_cast(op); - if (!fc) { - errorMessage = "Only legacy FullyConnected operation is supported"; + if (!ov::is_type(op) && + !ov::is_type(op) && + !ov::is_type(op)) { return false; } - if (fc->get_input_size() == 3 && - std::dynamic_pointer_cast(fc->get_input_node_shared_ptr(BIAS_ID)) == nullptr) { - errorMessage = "Only Constant operation on 'bias' input is supported"; + + if (ov::is_type(op)) { + if (!ov::op::util::is_on_constant_path(op->input_value(BIAS))) { + errorMessage = "Only Constant operation on 'bias' input is supported"; + return false; + } + } + + if (ov::is_type(op)) { + if (!ov::op::util::is_on_constant_path(op->input_value(WEIGHT_SCALES)) || + !ov::op::util::is_on_constant_path(op->input_value(WEIGHT_ZERO_POINTS))) { + errorMessage = "Only Constant operation on 'weight scales', and 'weight zero points' inputs is supported"; + return false; + } + } + } catch (...) { + return false; + } + + return true; +} + +// @todo replace 'inferencePrecision' check with 'fc->get_input_element_type(0) == ov::element::bf16' +// after bf16 pipeline is moved to ConvertPrecision +bool FullyConnected::isSupportedCompressedOperation(const std::shared_ptr& op, + size_t IC, + size_t OC, + size_t G, + ov::element::Type inferencePrecision) noexcept { +#if defined(OPENVINO_ARCH_X86_64) + try { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) return false; + + if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) + return false; + + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx) && + inferencePrecision == ov::element::bf16) { + // OneDNN AMX IP implementation has limited shapes support due to performance considerations. As a + // current solution conditions below are copied from OneDNN to make sure correct IP impl will be + // used since fallback one doesn't support weights decompression feature. 
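To make the heuristic in this comment concrete, here is a standalone sketch of the same shape screen with a few made-up IC/OC values; the constants match the lines that follow:

    #include <cstddef>
    #include <cstdio>

    // Shapes for which OneDNN would fall back from the AMX inner-product
    // implementation (and thus lose weights decompression) are rejected.
    static bool amx_shape_supported(std::size_t IC, std::size_t OC) {
        const std::size_t simdWidth = 16;
        const std::size_t vnniFactor = 2;
        const std::size_t maxSize = 512;
        const std::size_t amxRow = vnniFactor * simdWidth; // 32
        if ((IC <= amxRow && OC <= amxRow) || (IC <= maxSize && OC <= maxSize && IC % amxRow != 0))
            return false;
        return true;
    }

    int main() {
        std::printf("4096x4096 -> %d\n", amx_shape_supported(4096, 4096)); // 1: large, AMX-friendly
        std::printf(" 512x 512 -> %d\n", amx_shape_supported(512, 512));   // 1: IC is a multiple of 32
        std::printf(" 500x 500 -> %d\n", amx_shape_supported(500, 500));   // 0: small and IC % 32 != 0
        std::printf("  16x  16 -> %d\n", amx_shape_supported(16, 16));     // 0: fits a single AMX row
        return 0;
    }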
+ size_t simdWidth = 16; + size_t vnniFactor = 2; + size_t maxSize = 512; + auto amxRow = vnniFactor * simdWidth; + + if ((IC <= amxRow && OC <= amxRow) || (IC <= maxSize && OC <= maxSize && IC % amxRow != 0)) { + return false; + } } - const auto weightRank = fc->get_input_partial_shape(WEIGHTS_ID).size(); - if (weightRank != 2) { - errorMessage = "Doesn't support 'weight' input with rank: " + std::to_string(weightRank); + + if (IC % G != 0 || IC / G < 4 || OC == 1) { return false; } + + return true; } catch (...) { return false; } return true; +#else + return false; +#endif } void FullyConnected::initTensorParallelConfig(const GraphContext::CPtr context) { @@ -79,6 +136,31 @@ FullyConnected::FullyConnected(const std::shared_ptr& op, const GraphC initTensorParallelConfig(context); if (!isSupportedOperation(op, errorMessage)) OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); + + m_atoi[ARG_SRC] = DATA; + m_atoi[ARG_WEI] = WEIGHTS; + m_atoi[ARG_BIAS] = BIAS; + + auto mapArgToInput = [&op](std::unordered_map& argToInput, size_t argId, size_t inputId) { + if (op->get_input_size() > inputId && + op->input(inputId).get_element_type() != ov::element::undefined) { + argToInput[argId] = inputId; + } + }; + + if (ov::is_type(op)) { + mapArgToInput(m_atoi, ARG_WEI | ARG_ATTR_SCALES, WEIGHT_SCALES); + mapArgToInput(m_atoi, ARG_WEI | ARG_ATTR_ZERO_POINTS, WEIGHT_ZERO_POINTS); + algorithm = Algorithm::FullyConnectedCompressed; + } else if (ov::is_type(op)) { + mapArgToInput(m_atoi, ARG_DST_DEQ_SCALE, 3); + algorithm = Algorithm::FullyConnectedQuantizedLegacy; + } else if (ov::is_type(op)) { + algorithm = Algorithm::FullyConnectedQuantized; + OPENVINO_THROW_NOT_IMPLEMENTED("FullyConnectedQuantized is not implemented yet"); + } else { + algorithm = Algorithm::FullyConnectedCommon; + } } bool FullyConnected::canBeExecutedInInt8() const { @@ -220,6 +302,7 @@ void FullyConnected::execTensorParallelSync() { } } } + void FullyConnected::execute(dnnl::stream strm) { initTensorParallelSync(); @@ -366,31 +449,11 @@ static bool useSparseWeightsDecompression(const NodePtr& weightsInput, return sparseRate >= minSparseRate; } -void FullyConnected::needUpdateDQScaleForTensorParallel(std::vector& dequantizationScales) { - if (tp_cfg.enable_tensor_parallel) { - auto split_parts = [](int len, int n) { - int average = len / n; - std::vector parts(n, average); - parts.back() = len - average * (n - 1); - return parts; - }; - auto DQScales = getDQScales(); - auto split_lens = split_parts(DQScales.size(), tp_cfg.w_size); - auto split_offset = tp_cfg.w_rank * split_lens[0]; - std::vector newDQScales(split_lens[tp_cfg.w_rank]); - std::copy(DQScales.begin() + split_offset, DQScales.begin() + split_offset + split_lens[tp_cfg.w_rank], newDQScales.begin()); - dequantizationScales = std::move(newDQScales); - } -} - void FullyConnected::initSupportedPrimitiveDescriptors() { - attrs.withBias = getOriginalInputsNumber() == 3; - - attrs.dequantizationScales = getDQScales(); - needUpdateDQScaleForTensorParallel(attrs.dequantizationScales); + attrs.withBias = getOriginalInputPrecisionAtPort(BIAS) != ov::element::undefined; - attrs.sparseWeights = useSparseWeightsDecompression(getParentEdgeAt(WEIGHTS_ID)->getParent(), - getOriginalInputPrecisionAtPort(DATA_ID), + attrs.sparseWeights = useSparseWeightsDecompression(getParentEdgeAt(WEIGHTS)->getParent(), + getOriginalInputPrecisionAtPort(DATA), context->getConfig().fcSparseWeiDecompressionRate); attrs.dynamicQuantizationGroupSize = context->getConfig().fcDynamicQuantizationGroupSize; 
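For context on the sparse-weights choice made just above: useSparseWeightsDecompression accepts the weights only when the measured sparsity rate reaches the configured fcSparseWeiDecompressionRate threshold (the `return sparseRate >= minSparseRate;` seen earlier). A minimal sketch of such a rate check, assuming the rate is simply the fraction of zero-valued int8 weights (an assumption for illustration, not necessarily the plugin's exact definition):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Fraction of zero elements in an int8 weight buffer; this definition of
    // the sparse rate is assumed here purely for illustration.
    static float sparse_rate(const std::vector<std::int8_t>& weights) {
        if (weights.empty())
            return 0.0f;
        std::size_t zeros = 0;
        for (std::int8_t w : weights)
            if (w == 0)
                ++zeros;
        return static_cast<float>(zeros) / static_cast<float>(weights.size());
    }

    int main() {
        const float minSparseRate = 0.8f; // stand-in for fcSparseWeiDecompressionRate
        std::vector<std::int8_t> weights(1000, 0);
        for (std::size_t i = 0; i < weights.size(); i += 10)
            weights[i] = 1;                            // 10% non-zero, i.e. 90% sparse
        assert(sparse_rate(weights) >= minSparseRate); // decompression would be considered
        return 0;
    }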
attrs.modelType = context->getConfig().modelType; @@ -406,6 +469,10 @@ void FullyConnected::initSupportedPrimitiveDescriptors() { VecMemoryDescs srcDescs; const auto& creatorsMap = BlockedDescCreator::getCommonCreators(); for (size_t i = 0; i < srcTypes.size(); i++) { + if (srcTypes[i] == element::undefined) { + srcDescs.push_back(MemoryDescUtils::makeEmptyDesc()); + continue; + } const auto srcDesc = creatorsMap.at(LayoutType::ncsp)->createSharedDesc(srcTypes[i], getInputShapeAtPort(i)); srcDescs.push_back(srcDesc); } @@ -417,23 +484,31 @@ void FullyConnected::initSupportedPrimitiveDescriptors() { } MemoryDescArgs descs{ - {ARG_SRC, srcDescs[0]}, - {ARG_WEI, srcDescs[1]}, - {ARG_BIAS, attrs.withBias ? srcDescs[2] : MemoryDescUtils::makeEmptyDesc()}, + {ARG_SRC, srcDescs[DATA]}, + {ARG_WEI, srcDescs[WEIGHTS]}, + {ARG_BIAS, srcDescs[BIAS]}, {ARG_DST, dstDescs[0]}, }; - needUpdateScaleForTensorParallel(); - needUpdateZeroPointForTensorParallel(); - auto executionContext = std::make_shared(context, getImplPriority(), privateWeightCache); factory = std::make_shared>(attrs, postOps, executionContext, descs); const auto nodeDescriptors = factory->getProperMemoryDescriptors(descs); NodeConfig nodeConfig; - nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_SRC)); - nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_WEI)); - if (attrs.withBias) nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_BIAS)); + nodeConfig.inConfs.resize(srcDescs.size()); + + for (const auto& desc : nodeDescriptors) { + if (m_atoi.count(desc.first)) { + nodeConfig.inConfs[m_atoi[desc.first]] = desc.second; + } + } + + // add extra inputs bypassing proper memory descriptors + // @todo pass all the input descriptors to getProperMemoryDescriptors and allow + // to ignore extra input descriptors if necessery + for (size_t i = 3; i < srcDescs.size(); i++) { + nodeConfig.inConfs[i] = srcDescs[i]; + } const int inPlace = canBeInPlace() ? 0 : -1; nodeConfig.outConfs.emplace_back(nodeDescriptors.at(ARG_DST), BlockedMemoryDesc::FULL_MASK, inPlace); @@ -443,11 +518,11 @@ void FullyConnected::initSupportedPrimitiveDescriptors() { void FullyConnected::needSplitMemoryForTensorParallel() { if (tp_cfg.enable_tensor_parallel) { - auto src = getSrcMemoryAtPort(DATA_ID); - auto wgt = getSrcMemoryAtPort(WEIGHTS_ID); + auto src = getSrcMemoryAtPort(DATA); + auto wgt = getSrcMemoryAtPort(WEIGHTS); auto dst = getDstMemoryAtPort(0); // src - memory[ARG_SRC] = getSrcMemoryAtPort(DATA_ID); + memory[ARG_SRC] = getSrcMemoryAtPort(DATA); // wgt // split N direction tp_cfg.cached_splited_weight = attrs.weightsNonTransposed ? 
split_vertical(context->getEngine(), std::move(wgt), 0, tp_cfg.w_rank, tp_cfg.w_size) @@ -455,7 +530,7 @@ void FullyConnected::needSplitMemoryForTensorParallel() { memory[ARG_WEI] = tp_cfg.cached_splited_weight; // bias if (attrs.withBias) { - auto bias = getSrcMemoryAtPort(BIAS_ID); + auto bias = getSrcMemoryAtPort(BIAS); auto select_bias = split_horizontal(context->getEngine(), std::move(bias), 0, tp_cfg.w_rank, tp_cfg.w_size); tp_cfg.cached_splited_bias = std::move(select_bias); } else { @@ -465,6 +540,21 @@ void FullyConnected::needSplitMemoryForTensorParallel() { // dst memory[ARG_DST] = getDstMemoryAtPort(0); tp_cfg.cached_dst = split_horizontal(context->getEngine(), std::move(dst), -1, tp_cfg.w_rank, tp_cfg.w_size, false); + + memory[ARG_DST | ARG_ATTR_SCALES] = split_horizontal(context->getEngine(), memory[ARG_DST | ARG_ATTR_SCALES], 0, tp_cfg.w_rank, tp_cfg.w_size); + + auto scale_mem = std::const_pointer_cast(memory[ARG_WEI | ARG_ATTR_SCALES]); + memory[ARG_WEI | ARG_ATTR_SCALES] = attrs.weightsNonTransposed ? split_vertical(context->getEngine(), scale_mem, 0, tp_cfg.w_rank, tp_cfg.w_size) + : split_horizontal(context->getEngine(), scale_mem, 0, tp_cfg.w_rank, tp_cfg.w_size); + + auto zeropoint_mem = std::const_pointer_cast(memory[ARG_WEI | ARG_ATTR_ZERO_POINTS]); + auto element_num = zeropoint_mem->getSize() / zeropoint_mem->getPrecision().size(); + if (element_num == 1) { + tp_cfg.cached_zeropoint = zeropoint_mem; + } else { + tp_cfg.cached_zeropoint = attrs.weightsNonTransposed ? split_vertical(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size) + : split_horizontal(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size); + } } } @@ -473,7 +563,7 @@ void FullyConnected::needUpdateTensorParalelConfig() { // 1. weight shape is dynamic // 2. last dim can be splited. if (tp_cfg.enable_tensor_parallel) { - auto& shape = getSrcMemoryAtPort(WEIGHTS_ID)->getShape(); + auto& shape = getSrcMemoryAtPort(WEIGHTS)->getShape(); if (shape.isDynamic()) { tp_cfg.enable_tensor_parallel = false; } else if (shape.getDims()[0] < static_cast(tp_cfg.w_size)) { @@ -481,12 +571,16 @@ void FullyConnected::needUpdateTensorParalelConfig() { } } } + void FullyConnected::createPrimitive() { needUpdateTensorParalelConfig(); - memory[ARG_SRC] = getSrcMemoryAtPort(DATA_ID); - memory[ARG_WEI] = getSrcMemoryAtPort(WEIGHTS_ID); - memory[ARG_BIAS] = attrs.withBias ? getSrcMemoryAtPort(BIAS_ID) : MemoryDescUtils::makeEmptyMemory(context); + for (const auto& entry : m_atoi) { + const auto argumentId = entry.first; + const auto inputId = entry.second; + memory[argumentId] = getSrcMemoryAtPort(inputId); + } + memory[ARG_DST] = getDstMemoryAtPort(0); needSplitMemoryForTensorParallel(); @@ -513,49 +607,6 @@ ov::element::Type FullyConnected::getRuntimePrecision() const { return getMaxPrecision(srcTypes); } -void FullyConnected::needUpdateScaleForTensorParallel() { - if (tp_cfg.enable_tensor_parallel && tp_cfg.cached_scale) { - attrs.decompressionMultiplyPtr = tp_cfg.cached_scale; - } -} - -void FullyConnected::needSplitScaleForTensorParallel(const MemoryCPtr& memory) { - if (tp_cfg.enable_tensor_parallel && !tp_cfg.cached_scale) { - auto scale_mem = std::const_pointer_cast(memory); - tp_cfg.cached_scale = attrs.weightsNonTransposed ? 
split_vertical(context->getEngine(), std::move(scale_mem), 0, tp_cfg.w_rank, tp_cfg.w_size) - : split_horizontal(context->getEngine(), std::move(scale_mem), 0, tp_cfg.w_rank, tp_cfg.w_size); - } -} - -void FullyConnected::needUpdateZeroPointForTensorParallel() { - if (tp_cfg.enable_tensor_parallel && tp_cfg.cached_zeropoint) { - attrs.decompressionSubtractPtr = tp_cfg.cached_zeropoint; - } -} - -void FullyConnected::needSplitZeroPointForTensorParallel(const MemoryCPtr& memory) { - if (tp_cfg.enable_tensor_parallel && !tp_cfg.cached_zeropoint) { - auto zeropoint_mem = std::const_pointer_cast(memory); - auto element_num = memory->getSize() / memory->getPrecision().size(); - if (element_num == 1) { - tp_cfg.cached_zeropoint = std::move(zeropoint_mem); - } else { - tp_cfg.cached_zeropoint = attrs.weightsNonTransposed ? split_vertical(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size) - : split_horizontal(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size); - } - } -} - -void FullyConnected::fuseDecompressionMultiply(const MemoryCPtr& memory) { - attrs.decompressionMultiplyPtr = memory; - needSplitScaleForTensorParallel(memory); -} - -void FullyConnected::fuseDecompressionSubtract(const MemoryCPtr& memory) { - attrs.decompressionSubtractPtr = memory; - needSplitZeroPointForTensorParallel(memory); -} - } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h index 8c17228e365af4..177edd3d426339 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h @@ -6,9 +6,11 @@ #include +#include #include #include #include +#include #include #include "cpu_memory.h" @@ -65,6 +67,15 @@ class FullyConnected : public Node { bool canFuse(const NodePtr& node) const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + static bool isSupportedCompressedOperation(const std::shared_ptr& op, + size_t IC, + size_t OC, + size_t G, + ov::element::Type inferencePrecision) noexcept; + + bool isExecutable() const override { + return !isInputTensorAtPortEmpty(0); + } void prepareParams() override; void executeDynamicImpl(dnnl::stream strm) override; @@ -80,9 +91,21 @@ class FullyConnected : public Node { void toNumaNodeImpl(int numaID) override; private: - static const size_t DATA_ID = 0; - static const size_t WEIGHTS_ID = 1; - static const size_t BIAS_ID = 2; + enum InputId : size_t { + DATA = 0, + WEIGHTS, + BIAS, + WEIGHT_SCALES, + WEIGHT_ZERO_POINTS, + INPUT_SCALES, + INPUT_ZERO_POINTS, + OUTPUT_SCALES, + OUTPUT_ZERO_POINTS, + }; + + static bool isConstantInput(const std::shared_ptr& op, InputId port); + + std::unordered_map m_atoi; // memory argument id to input id void fuseDecompressionConstant(const MemoryCPtr& memory, MemoryCPtr& decompressionValuesPtr); @@ -92,11 +115,6 @@ class FullyConnected : public Node { void initTensorParallelSync(); void execTensorParallelSync(); void needSplitMemoryForTensorParallel(); - void needSplitScaleForTensorParallel(const MemoryCPtr& memory); - void needUpdateScaleForTensorParallel(); - void needSplitZeroPointForTensorParallel(const MemoryCPtr& memory); - void needUpdateZeroPointForTensorParallel(); - void needUpdateDQScaleForTensorParallel(std::vector& dequantizationScales); FCAttrs attrs; PostOps postOps; diff --git a/src/plugins/intel_cpu/src/nodes/input.cpp b/src/plugins/intel_cpu/src/nodes/input.cpp index 
1f650bd8c5de17..4ccdc87ada25f1 100644 --- a/src/plugins/intel_cpu/src/nodes/input.cpp +++ b/src/plugins/intel_cpu/src/nodes/input.cpp @@ -7,7 +7,10 @@ #include "cpu/x64/jit_generator.hpp" #include "nodes/node_config.h" #include "openvino/core/parallel.hpp" +#include "openvino/core/shape.hpp" +#include "openvino/core/type/element_type.hpp" #include "shape_inference/shape_inference_pass_through.hpp" +#include "memory_desc/cpu_memory_desc_utils.h" using namespace dnnl; using namespace dnnl::impl::cpu::x64; @@ -228,9 +231,9 @@ Input::Input(const std::shared_ptr& op, const GraphContext::CPtr conte op->get_type_name(), " with name ", op->get_friendly_name()); - constOp = ov::as_type_ptr(op); - if (constOp) { + if (auto constOp = ov::as_type_ptr(op)) { constant = ConstantType::Const; + m_constOp = constOp; cloneBlobIfRequired(); } else { constant = ConstantType::StrictNoConst; @@ -238,8 +241,14 @@ Input::Input(const std::shared_ptr& op, const GraphContext::CPtr conte } void Input::cloneBlobIfRequired() { - Shape shape(constOp->get_shape().empty() ? ov::Shape(1, 1) : constOp->get_shape()); - const auto prec = constOp->get_element_type(); + const auto prec = m_constOp->get_element_type(); + + if (prec == ov::element::undefined && shape_size(m_constOp->get_shape()) == 0) { + memoryPtr = MemoryDescUtils::makeEmptyMemory(context); + return; + } + + Shape shape(m_constOp->get_shape().empty() ? ov::Shape(1, 1) : m_constOp->get_shape()); const size_t size = shape.getElementsCount(); CpuBlockedMemoryDesc memDesc(prec, shape); @@ -258,21 +267,21 @@ void Input::cloneBlobIfRequired() { // oneDNN always allocate 1byte for element type with bitWidth < 8 (u4,u1...) // but ngraph Constant uses actual bitWidth for data storage allocation // in that case we make a copy to avoid overflow - if (constOp->get_byte_size() >= memDesc.getCurrentMemSize()) { - if (constOp->get_element_type() == element::string) { - memory = std::make_shared(getEngine(), memDesc, constOp->get_data_ptr()); + if (m_constOp->get_byte_size() >= memDesc.getCurrentMemSize()) { + if (m_constOp->get_element_type() == element::string) { + memory = std::make_shared(getEngine(), memDesc, m_constOp->get_data_ptr()); } else { - memory = std::make_shared(getEngine(), memDesc, constOp->get_data_ptr()); + memory = std::make_shared(getEngine(), memDesc, m_constOp->get_data_ptr()); } } else { - if (constOp->get_element_type() == element::string) { + if (m_constOp->get_element_type() == element::string) { memory = std::make_shared(getEngine(), memDesc); - auto src = constOp->get_data_ptr(); + auto src = m_constOp->get_data_ptr(); auto dst = memory->getDataAs(); std::copy(src, src + size, dst); } else { memory = std::make_shared(getEngine(), memDesc); - memcpy(memory->getData(), constOp->get_data_ptr(), constOp->get_byte_size()); + memcpy(memory->getData(), m_constOp->get_data_ptr(), m_constOp->get_byte_size()); } } @@ -287,22 +296,22 @@ void Input::cloneBlobIfRequired() { return ptr; }; - auto isBlobAligned = [&] () { - bool blobAlignedOnSSE = true; + auto isBlobAligned = [] (const std::shared_ptr& constant) { #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) // Majority of arithmetic and data processing instructions in legacy SSE isa requires // the memory address in the operands must be aligned on 16-byte boundary. To ensure // safely reusing ngraph const blob memory, need to check address alignment. 
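The alignment requirement mentioned in this comment is exactly the `(address & 15) == 0` test on the following lines: a pointer is 16-byte aligned when its four lowest address bits are zero. A small standalone sketch (plain C++, illustrative buffer only):

    #include <cassert>
    #include <cstdint>

    // True when 'ptr' sits on a 16-byte boundary, i.e. the low 4 address bits
    // are zero, which is what legacy SSE aligned loads require.
    static bool aligned16(const void* ptr) {
        return (reinterpret_cast<std::uintptr_t>(ptr) & 15) == 0;
    }

    int main() {
        alignas(16) unsigned char buffer[32] = {};
        assert(aligned16(buffer));      // start of an alignas(16) buffer
        assert(!aligned16(buffer + 1)); // an odd offset cannot be 16-byte aligned
        assert(aligned16(buffer + 16)); // the next 16-byte boundary
        return 0;
    }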
- const void *ptr = constOp->get_data_ptr(); - blobAlignedOnSSE = mayiuse(cpu_isa_t::avx2) || ((reinterpret_cast(ptr) & 15) == 0); + const void *ptr = constant->get_data_ptr(); + return mayiuse(cpu_isa_t::avx2) || ((reinterpret_cast(ptr) & 15) == 0); +#else + return true; #endif - return blobAlignedOnSSE; }; // The presence of subnormals is better to determined at IR read time. auto hasSubnormals = [&] () { if (prec == ov::element::f32) { - uint32_t const *u32data = constOp->get_data_ptr(); + uint32_t const *u32data = m_constOp->get_data_ptr(); if (!size) return false; @@ -345,7 +354,7 @@ void Input::cloneBlobIfRequired() { auto blobKey = [&] () { char ptr[32]; - snprintf(ptr, sizeof ptr, "%p", constOp->get_data_ptr()); + snprintf(ptr, sizeof ptr, "%p", m_constOp->get_data_ptr()); return getName() + "_" + std::to_string(size * prec.size()) + "_" + ptr; @@ -356,12 +365,13 @@ void Input::cloneBlobIfRequired() { prec != element::string && // IRs already have all subnormals flushed to zero, but in // read_model scenario with directly loaded original model still can have subnormals - isBlobAligned() && (!needFlushDenormalsToZero || !hasSubnormals()) && + isBlobAligned(m_constOp) && (!needFlushDenormalsToZero || !hasSubnormals()) && // Blob should be cloned in cache only if original weights are stored on other numa node. // This is possible only in multistream case on multisocket machine. // TODO: don't clone blob for multisocket + multistream case if current stream is run on the numa node where original weights are stored. (!weightCache || context->getNumNumaNodes() == 1 || context->getCPUStreamExecutor()->get_streams_num() == 1); - memoryPtr = clone_is_not_needed ? std::make_shared(getEngine(), memDesc, constOp->get_data_ptr()) + + memoryPtr = clone_is_not_needed ? std::make_shared(getEngine(), memDesc, m_constOp->get_data_ptr()) : std::const_pointer_cast( weightCache ? *weightCache->findOrCreate(blobKey(), cloneBlob) : cloneBlob()); } diff --git a/src/plugins/intel_cpu/src/nodes/input.h b/src/plugins/intel_cpu/src/nodes/input.h index 4d7febb17ad4b7..e659ea2359aabd 100644 --- a/src/plugins/intel_cpu/src/nodes/input.h +++ b/src/plugins/intel_cpu/src/nodes/input.h @@ -75,7 +75,7 @@ class Input : public Node { void initSupportedPdFromMemDesc(); private: - std::shared_ptr constOp; + std::shared_ptr m_constOp; MemoryCPtr memoryPtr; bool isMeanImage = false; MemoryDescPtr extMemDesc = nullptr; diff --git a/src/plugins/intel_cpu/src/nodes/reference.cpp b/src/plugins/intel_cpu/src/nodes/reference.cpp index 5dc7c8818dd52b..b84836c869deb3 100644 --- a/src/plugins/intel_cpu/src/nodes/reference.cpp +++ b/src/plugins/intel_cpu/src/nodes/reference.cpp @@ -29,7 +29,7 @@ Reference::Reference(const std::shared_ptr& op, : Node(op, context, ReferenceShapeInferFactory(op)), ovCoreNode(op), additionalErrorMessage(errorMessage) { if (!op->has_evaluate()) { OPENVINO_THROW_NOT_IMPLEMENTED( - "Cannot fallback on ngraph reference implementation (Ngraph::Node::evaluate() is not implemented)"); + "Cannot fallback on ngraph reference implementation. 
Ngraph::Node::evaluate() is not implemented for op: ", *op); } setType(Type::Reference); diff --git a/src/plugins/intel_cpu/src/shape_inference/custom/fullyconnected.cpp b/src/plugins/intel_cpu/src/shape_inference/custom/fullyconnected.cpp index 5aef73df1949bd..048b413b61a60b 100644 --- a/src/plugins/intel_cpu/src/shape_inference/custom/fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/shape_inference/custom/fullyconnected.cpp @@ -15,7 +15,7 @@ Result FCShapeInfer::infer( const VectorDims& activationShape = input_shapes[0].get(); const VectorDims& weightShape = input_shapes[1].get(); size_t activationRank = activationShape.size(); - size_t channelRank = weightShape.size() - 1; + size_t channelRank = 1; // activation weight output_shape // NCHW CoCHW NCo @@ -23,7 +23,7 @@ Result FCShapeInfer::infer( // NC CoC NCo VectorDims outputShape(out_rank, 1); // set Co - outputShape.back() = weightShape[0]; + outputShape.back() = std::accumulate(weightShape.begin(), weightShape.end() - 1, 1, std::multiplies()); // set batch dims size_t batchRank = activationRank - channelRank; size_t startIdx = out_rank - batchRank - 1; diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/fully_connected.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/fully_connected.cpp deleted file mode 100644 index a6d97b6a84b613..00000000000000 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/fully_connected.cpp +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "fully_connected.hpp" -#include "transformations/itt.hpp" - -ov::intel_cpu::FullyConnectedNode::FullyConnectedNode(const ov::Output& A, - const ov::Output& B, - const ov::Rank& output_rank, - const ov::element::Type output_type) - : Op({A, B}), m_output_rank(output_rank), m_output_type(output_type) { - validate_and_infer_types(); -} - -std::shared_ptr ov::intel_cpu::FullyConnectedNode::clone_with_new_inputs(const ov::OutputVector& new_args) const { - INTERNAL_OP_SCOPE(FullyConnectedNode_clone_with_new_inputs); - check_new_args_count(this, new_args); - - return std::make_shared(new_args.at(0), new_args.at(1), m_output_rank, m_output_type); -} - -void ov::intel_cpu::FullyConnectedNode::validate_and_infer_types() { - INTERNAL_OP_SCOPE(FullyConnectedNode_validate_and_infer_types); - const auto input_size = get_input_size(); - NODE_VALIDATION_CHECK(this, - input_size == 2, - "Number of inputs is incorrect. 
Current value is: ", - input_size, - ", expected: 2."); - - // Weights shape: [O, I1, ..., Im]; - // O - output channels dimensions, Ik - input channels dimensions - const auto weights_pshape = get_input_partial_shape(1); - NODE_VALIDATION_CHECK(this, - weights_pshape.is_static(), - "Weights pshape must be static"); - const auto weights_shape = weights_pshape.to_shape(); - - NODE_VALIDATION_CHECK(this, - weights_pshape.size() > 0, - "Weights rank must be greater than 0"); - - const auto o_channels = weights_pshape[0]; - - // Activations shape: [B1, ..., Bn, I1, ..., Im]; - // Bi - batch dimensions, Ik - input channels dimensions - const auto activations_pshape = get_input_partial_shape(0); - - // Result shape: [B1, ..., Bn, O] - ov::PartialShape output_pshape; - if (activations_pshape.rank().is_static()) { - size_t output_channels_dimensions_count = weights_shape.size() - 1; - for (size_t i = 0; i < activations_pshape.size() - output_channels_dimensions_count; ++i) { - output_pshape.push_back(activations_pshape[i]); - } - output_pshape.push_back(o_channels); - - NODE_VALIDATION_CHECK(this, - m_output_rank.is_static(), - "Output rank must be static if activations rank is static."); - - while (output_pshape.rank().get_length() < m_output_rank.get_length()) { - output_pshape.insert(output_pshape.begin(), 1); - } - } else { - output_pshape = ov::PartialShape::dynamic(); - } - - auto output_type = m_output_type == ov::element::undefined ? get_input_element_type(0) : m_output_type; - set_output_type(0, output_type, output_pshape); -} - -bool ov::intel_cpu::FullyConnectedNode::visit_attributes(ov::AttributeVisitor &visitor) { - INTERNAL_OP_SCOPE(FullyConnectedNode_visit_attributes); - visitor.on_attribute("out-rank", m_output_rank); - visitor.on_attribute("out-type", m_output_type); - return true; -} diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/fully_connected.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/fully_connected.hpp deleted file mode 100644 index d992b76cf0b79b..00000000000000 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/fully_connected.hpp +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "openvino/core/node.hpp" -#include "openvino/op/op.hpp" - -namespace ov { -namespace intel_cpu { - -class FullyConnectedNode : public ov::op::Op { -public: - OPENVINO_OP("FullyConnected", "cpu_plugin_opset"); - - FullyConnectedNode() = default; - - FullyConnectedNode(const ov::Output &A, - const ov::Output &B, - const ov::Rank& output_rank, - const ov::element::Type output_type = ov::element::undefined); - - bool visit_attributes(ov::AttributeVisitor &visitor) override; - - void validate_and_infer_types() override; - - std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; - - ov::Rank get_output_rank() const { return m_output_rank; } - ov::element::Type get_output_type() const { return m_output_type; } - -private: - ov::Rank m_output_rank; - ov::element::Type m_output_type; -}; - -} // namespace intel_cpu -} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp index f2861843a81110..da25e9aac30240 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp +++ 
b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp @@ -2,12 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "transformations/cpu_opset/common/op/fully_connected.hpp" +#include "openvino/core/type/element_type.hpp" +#include "ov_ops/fully_connected.hpp" #include "convert_matmul_to_fc.hpp" #include "openvino/op/matmul.hpp" #include "openvino/op/convert.hpp" #include "openvino/op/transpose.hpp" -#include "openvino/op/reshape.hpp" #include "openvino/core/rt_info.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "transformations/utils/utils.hpp" @@ -135,22 +135,6 @@ ov::intel_cpu::ConvertMatMulToFC::ConvertMatMulToFC() { OPENVINO_THROW("MatMul " + matmul->get_friendly_name() + " shapes are inconsistent."); } - // Transferring from MatMul representation: [B, I, K] * [B, K, O] = [B, I, O] - // to FullyConnected representation: [I, K] * [K, O] = [I, O] - - if (rank_b != 2) { - ov::Dimension K = *(shape_b_aligned.rbegin() + 1); - OPENVINO_ASSERT(K.is_static()); - auto k_len = K.get_length(); - auto reshape_shape_values = matmul->get_transpose_b() ? std::vector{-1, k_len} : std::vector{k_len, -1}; - auto reshape_shape = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, reshape_shape_values); - fc_input_b = ov::op::util::make_try_fold(fc_input_b, reshape_shape, false); - if (!std::dynamic_pointer_cast(fc_input_b.get_node_shared_ptr())) { - new_ops.push_back(reshape_shape); - } - new_ops.push_back(fc_input_b.get_node_shared_ptr()); - } - // Weights normalization if (!matmul->get_transpose_b()) { fc_input_b = create_transpose(fc_input_b, matmul->get_friendly_name() + "/transpose_b"); @@ -169,10 +153,14 @@ ov::intel_cpu::ConvertMatMulToFC::ConvertMatMulToFC() { fc_input_b = convert; } - // Create FullyConnected - auto output_rank = matmul->get_output_partial_shape(0).rank(); - auto fc = std::make_shared(fc_input_a, fc_input_b, output_rank, - matmul->get_output_element_type(0)); + auto bias = std::make_shared(element::undefined, Shape{0}); + new_ops.push_back(bias); + + auto fc = std::make_shared(fc_input_a, + fc_input_b, + bias, + matmul->get_output_element_type(0)); + fc->set_friendly_name(matmul->get_friendly_name()); ///todo: CVS-130863 Remove after fp16_compression is copyable if (ov::fp16_compression_is_disabled(matmul)) diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.hpp index 69991802101138..7d75fcc19170d0 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.hpp @@ -4,7 +4,7 @@ #pragma once -#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/matcher_pass.hpp" namespace ov { namespace intel_cpu { diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_to_power_static.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_to_power_static.cpp index 8079286d1e3ad7..03d9a294bbcab9 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_to_power_static.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_to_power_static.cpp @@ -12,7 +12,7 @@ #include "openvino/pass/pattern/op/or.hpp" #include "transformations/rt_info/dequantization_node.hpp" #include "transformations/cpu_opset/common/op/power_static.hpp" -#include 
"transformations/cpu_opset/common/op/fully_connected.hpp" +#include "ov_ops/fully_connected.hpp" #include "utils/general_utils.h" #include "itt.hpp" @@ -47,16 +47,16 @@ bool isConvertableToPowerStatic(const std::shared_ptr &node) { return ov::shape_size(const_shape) == 1 && input_rank.get_length() >= static_cast(const_shape.size()) && !ov::intel_cpu::one_of(node->get_input_node_shared_ptr(nonConstPort)->get_type_info(), - ov::opset1::NormalizeL2::get_type_info_static(), - ov::opset4::Interpolate::get_type_info_static(), - ov::opset1::Convolution::get_type_info_static(), - ov::opset1::GroupConvolution::get_type_info_static(), - ov::opset1::ConvolutionBackpropData::get_type_info_static(), - ov::opset1::GroupConvolutionBackpropData::get_type_info_static(), - ov::opset1::MatMul::get_type_info_static(), - ov::intel_cpu::FullyConnectedNode::get_type_info_static(), - ov::op::v0::MVN::get_type_info_static(), - ov::opset6::MVN::get_type_info_static()); + ov::opset1::NormalizeL2::get_type_info_static(), + ov::opset4::Interpolate::get_type_info_static(), + ov::opset1::Convolution::get_type_info_static(), + ov::opset1::GroupConvolution::get_type_info_static(), + ov::opset1::ConvolutionBackpropData::get_type_info_static(), + ov::opset1::GroupConvolutionBackpropData::get_type_info_static(), + ov::opset1::MatMul::get_type_info_static(), + ov::op::internal::FullyConnected::get_type_info_static(), + ov::op::v0::MVN::get_type_info_static(), + ov::opset6::MVN::get_type_info_static()); } template <> diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.cpp new file mode 100644 index 00000000000000..d92d2d3627b65b --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.cpp @@ -0,0 +1,79 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fc_bias_fusion.hpp" + +#include +#include + +#include "itt.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "ov_ops/fully_connected.hpp" +#include "transformations/utils/utils.hpp" + +ov::intel_cpu::FullyConnectedBiasFusion::FullyConnectedBiasFusion() { + MATCHER_SCOPE(FullyConnectedBiasFusion); + + auto input = ov::pass::pattern::any_input(ov::pass::pattern::has_static_rank()); + auto weights = ov::pass::pattern::any_input(ov::pass::pattern::has_static_shape()); + auto bias = ov::pass::pattern::wrap_type(); + auto m_fc = ov::pass::pattern::wrap_type({input, weights, bias}, + ov::pass::pattern::consumers_count(1)); + auto m_bias = ov::pass::pattern::wrap_type(); + auto m_add = ov::pass::pattern::wrap_type({m_fc, m_bias}); + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + auto& pattern_to_output = m.get_pattern_value_map(); + + auto add = pattern_to_output[m_add].get_node_shared_ptr(); + auto bias = pattern_to_output[m_bias].get_node_shared_ptr(); + + auto fc = pattern_to_output[m_fc].get_node_shared_ptr(); + + if (transformation_callback(fc)) { + return false; + } + + ov::Shape bias_shape(bias->get_shape()); + const ov::PartialShape& output_shape = fc->get_output_partial_shape(0); + size_t bias_size = ov::shape_size(bias_shape); + auto rank = output_shape.size(); + if (rank == 0 || output_shape[rank - 1].is_dynamic()) { + return false; + } + + if (bias_shape.empty() || 
static_cast(bias_shape.back()) != output_shape[rank - 1].get_length() || + bias_shape.back() != bias_size) { + return false; + } + + ov::NodeVector new_ops; + + std::shared_ptr final_bias = bias; + if (bias_shape.size() >= 2) { + auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1}); + final_bias = ov::op::util::make_try_fold(final_bias, reshape_const, true); + new_ops.push_back(final_bias); + } + + std::shared_ptr fc_with_bias; + + auto fc_node = ov::as_type_ptr(fc); + fc_with_bias = fc_node->clone_with_new_inputs({fc_node->input_value(0), fc_node->input_value(1), final_bias}); + + new_ops.push_back(fc_with_bias); + + fc_with_bias->set_friendly_name(add->get_friendly_name()); + ov::copy_runtime_info({fc, add}, new_ops); + ov::replace_node(add, fc_with_bias); + return true; + }; + + auto m = std::make_shared(m_add, matcher_name); + this->register_matcher(m, callback); +} diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.hpp new file mode 100644 index 00000000000000..b21cf80ad327e6 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.hpp @@ -0,0 +1,19 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/matcher_pass.hpp" + +namespace ov { +namespace intel_cpu { + +class FullyConnectedBiasFusion : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("FullyConnectedBiasFusion", "0"); + FullyConnectedBiasFusion(); +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_fc_reshape_to_weights.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_fc_reshape_to_weights.cpp index e681cd48ce8087..18a54dc45e173f 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_fc_reshape_to_weights.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_fc_reshape_to_weights.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "transformations/cpu_opset/common/op/fully_connected.hpp" +#include "ov_ops/fully_connected.hpp" #include "move_fc_reshape_to_weights.hpp" #include #include @@ -48,7 +48,8 @@ ov::intel_cpu::MoveFCReshapeToWeights::MoveFCReshapeToWeights() { auto weights_input_m = std::make_shared(ov::OutputVector{reshape_m, transpose_m}); auto data_m = any_input(); - auto fully_connected_m = wrap_type({data_m, weights_input_m}); + auto bias_m = any_input(); + auto fully_connected_m = wrap_type({data_m, weights_input_m, bias_m}); ov::matcher_pass_callback callback = [&](ov::pass::pattern::Matcher& m) { const auto fully_connected = m.get_match_root(); diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/split_fc.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/split_fc.cpp deleted file mode 100644 index 27207b3e051fdb..00000000000000 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/split_fc.cpp +++ /dev/null @@ -1,207 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "openvino/core/rt_info.hpp" -#include "openvino/pass/pattern/op/wrap_type.hpp" -#include "openvino/pass/constant_folding.hpp" -#include -#include "openvino/op/concat.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/convert.hpp" -#include 
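
The shape check performed by the new FullyConnectedBiasFusion pass above can be restated in a small standalone sketch: every bias dimension except the last must be 1, the last bias dimension must equal the FC output channel count, and a multi-dimensional bias is then flattened to a 1-D vector (the Reshape(-1) the pass inserts).

// Standalone restatement of the fusion precondition; not the pass itself.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

using Shape = std::vector<size_t>;

bool bias_is_fusable(const Shape& bias_shape, int64_t output_channels) {
    if (bias_shape.empty())
        return false;
    const size_t bias_size =
        std::accumulate(bias_shape.begin(), bias_shape.end(), size_t{1}, std::multiplies<size_t>());
    // last dim must match Co and must carry all elements (i.e. all other dims are 1)
    return static_cast<int64_t>(bias_shape.back()) == output_channels && bias_shape.back() == bias_size;
}

Shape flatten_bias(const Shape& bias_shape) {
    // what the Reshape with a {-1} pattern produces for a fusable bias
    return {std::accumulate(bias_shape.begin(), bias_shape.end(), size_t{1}, std::multiplies<size_t>())};
}

int main() {
    std::cout << bias_is_fusable({1, 1, 5}, 5) << ' '      // 1: broadcastable per-channel bias
              << bias_is_fusable({2, 5}, 5) << '\n';       // 0: batch-dependent bias cannot be fused
    std::cout << flatten_bias({1, 1, 5}).front() << '\n';  // 5
}
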
"openvino/op/multiply.hpp" -#include "openvino/op/reshape.hpp" -#include "openvino/op/subtract.hpp" -#include "openvino/op/transpose.hpp" -#include "openvino/op/variadic_split.hpp" -#include "transformations/cpu_opset/common/op/fully_connected.hpp" - -#include "split_fc.hpp" - -#include "itt.hpp" - -ov::intel_cpu::SplitFC::SplitFC(int sub_stream_num) { - MATCHER_SCOPE(SplitFC); - auto fc_m = ov::pass::pattern::wrap_type(); - - ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { - const auto& pattern_map = m.get_pattern_value_map(); - - const auto& fc_node = pattern_map.at(fc_m).get_node_shared_ptr(); - auto& rt_info = fc_node->get_rt_info(); - if (rt_info.count("parallelDomain")) { - return false; - } - - const auto src_item = fc_node->get_input_node_shared_ptr(0); - const auto fc_weight_node = fc_node->get_input_node_shared_ptr(1); - - // split happens on the first dimension. - constexpr size_t split_dim = 0; - auto split_dim_node = std::make_shared(ov::element::i32, ov::Shape{}, split_dim); - - // needn't to split fc when the dim is 0. - const auto& wgt_shape = fc_weight_node->get_shape(); - // weight shape size 660000 is a trade-off value, which is summarized and verified by LLMs. - if (wgt_shape[split_dim] <= 1 || ov::shape_size(wgt_shape) < 6600000) { - return false; - } - - // parts will be splited according the sub stream num. - int split_num = sub_stream_num + 1; - - auto split_parts = [](int len, int n) { - int average = len / n; - std::vector parts(n, average); - parts.back() = len - average * (n - 1); - return parts; - }; - - // TODO: support transpose - if (ov::is_type(fc_weight_node)) { - return false; - } - - // 1. If the model is INT4 format, split the INT4 pattern for the FuseFCAndWeightsDecompression. - // 2. If the model is NOT INT4 format, split the weight. - std::vector> wgt_node_vec(split_num); - if (ov::is_type(fc_weight_node) || ov::is_type(fc_weight_node)) { - // INT4 model should consider two patterns, including with Reshape Node and without Reshape Node. - const auto reshape_node = ov::as_type_ptr(fc_weight_node); - const auto multiply_node = reshape_node ? reshape_node->get_input_node_shared_ptr(0) : fc_weight_node; - if (!ov::is_type(multiply_node)) { - return false; - } - auto multiply_pattern = multiply_node->get_input_node_shared_ptr(1); - if (!ov::is_type(multiply_pattern)) { - return false; - } - auto subtract_node = multiply_node->get_input_node_shared_ptr(0); - if (!ov::is_type(subtract_node)) { - return false; - } - auto convert_node1 = subtract_node->get_input_node_shared_ptr(1); - if (!ov::is_type(convert_node1)) { - return false; - } - auto convert_node1_const = ov::as_type_ptr(convert_node1->get_input_node_shared_ptr(0)); - if (!convert_node1_const) { - return false; - } - auto convert_node0 = subtract_node->get_input_node_shared_ptr(0); - if (!ov::is_type(convert_node0)) { - return false; - } - auto wgt_item = convert_node0->get_input_node_shared_ptr(0); - auto cvt_prec = convert_node0->get_element_type(); - - auto split_dim_range = wgt_item->get_shape()[split_dim]; - const auto& convert_node1_shape = convert_node1->get_shape(); - bool need_to_split_convert = ov::shape_size(convert_node1_shape) > 1 && - split_dim < convert_node1_shape.size() && - convert_node1_shape[split_dim] == split_dim_range; - - // We should use VariadicSplit to split the input for FC. 
- std::vector> split_reshape_pattern_vec(split_num); - auto fc_dim_vec = split_parts(split_dim_range, split_num); - auto split_length = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{static_cast(split_num)}, fc_dim_vec); - - auto split_constants = [&](const std::shared_ptr& constant) { - static const std::set unsupported_by_split_element_types{ov::element::u4, ov::element::i4, ov::element::nf4}; - const auto& constant_precision = constant->get_output_element_type(0); - if (unsupported_by_split_element_types.count(constant_precision) == 0) { - auto split = std::make_shared(constant, split_dim_node, split_length); - return split->outputs(); - } - - auto convert = std::make_shared(constant, ov::element::i8); - auto split = std::make_shared(convert, split_dim_node, split_length); - ov::OutputVector res(split->get_output_size()); - for (size_t i = 0; i < split->get_output_size(); ++i) { - res[i] = std::make_shared(split->output(i), constant_precision); - } - return res; - }; - - auto split_wgts = split_constants(wgt_item); - auto split_muls = split_constants(multiply_pattern); - ov::OutputVector split_cvts; - if (need_to_split_convert) { - split_cvts = split_constants(convert_node1_const); - } - - if (reshape_node) { - auto reshape_pattern = reshape_node->get_input_node_shared_ptr(1); - auto reshape_const = ov::as_type_ptr(reshape_pattern); - if (!reshape_const) { - return false; - } - const auto reshape_vec = reshape_const->cast_vector(); - for (int i = 0; i < split_num; ++i) { - split_reshape_pattern_vec[i] = {fc_dim_vec[i], reshape_vec[1]}; - } - } - - std::vector> zp_const_vec(split_num); - for (int i = 0; i < split_num; ++i) { - zp_const_vec[i] = need_to_split_convert ? split_cvts[i] : convert_node1_const->clone_with_new_inputs({}); - } - - for (int i = 0; i < split_num; ++i) { - auto sub_parent0 = std::make_shared(split_wgts[i], cvt_prec); - auto sub_parent1 = std::make_shared(zp_const_vec[i], cvt_prec); - ov::pass::disable_constant_folding(sub_parent0); - ov::pass::disable_constant_folding(sub_parent1); - auto sub_node = std::make_shared(sub_parent0, sub_parent1); - - auto mul_node = std::make_shared(sub_node, split_muls[i]); - if (reshape_node) { - auto reshape_pattern = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, split_reshape_pattern_vec[i]); - wgt_node_vec[i] = std::make_shared(mul_node, reshape_pattern, reshape_node->get_special_zero()); - } else { - wgt_node_vec[i] = mul_node; - } - } - } else { - // get input - auto wgt_item = fc_node->get_input_node_shared_ptr(1); - - // split weight - auto split_dim_range = wgt_item->get_shape()[split_dim]; - - // We should use VariadicSplit to split input for FC. - auto fc_dim_vec = split_parts(split_dim_range, split_num); - auto split_length = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{static_cast(split_num)}, fc_dim_vec); - auto split_wgts = std::make_shared(wgt_item, - split_dim_node, - split_length); - - wgt_node_vec = split_wgts->outputs(); - } - - // create fc Nodes according to the splited weight or splited pattern. - std::vector> fc_node_vec(split_num); - for (int i = 0; i < split_num; ++i) { - fc_node_vec[i] = fc_node->clone_with_new_inputs(ov::OutputVector{src_item, wgt_node_vec[i]}); - fc_node_vec[i]->get_rt_info()["parallelDomain"] = fc_node->get_name(); - } - - // concat all small fc for result. - ov::NodeVector concat_args(std::move(fc_node_vec)); - // concat happens on the latest dimension. 
- constexpr size_t concat_dim = -1; - auto concat_node = std::make_shared(concat_args, concat_dim); - - // check the shape after transformation. - const auto& out_shape = fc_node->get_output_partial_shape(0); - const auto& concat_shape = concat_node->get_output_partial_shape(0); - if (concat_shape != out_shape) { - return false; - } - ov::replace_node_update_name(fc_node, concat_node); - return true; - }; - - auto m = std::make_shared(fc_m, matcher_name); - this->register_matcher(m, callback); -} diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/split_fc.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/split_fc.hpp deleted file mode 100644 index f8434770b278ef..00000000000000 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/split_fc.hpp +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "openvino/pass/graph_rewrite.hpp" - -namespace ov { -namespace intel_cpu { - -/* - * Description: - * SplitFC detects FC CPU operation with and without compressed weights. - * And then splits the FC into several small FCs by output channel according to sub stream number. - * The goal is that the executor can dispatch the split FCs to different numa nodes in the system. - * As a result, the split FCs can be executed at the parallel level. - * - * Before: - * - * +-------+ +-------+ - * | X | | W | - * | | | | - * | | | | - * +-------+ +-------+ - * | | - * | | - * +---------------v---------------------------------v--------------+ - * | | - * | FullyConnected | - * | | - * +------------------------------+---------------------------------+ - * | - * | Output - * v - * - * After: - * - * +-------+ +-------+ - * | X | | W | - * | | | | - * | | | | - * +---+---+ +---+---+ - * | | - * | | - * | +-------v-------+ - * | | | - * | | VariadicSplit | - * | | | - * | +--+---------+--+ - * | | | - * | +------------------------+ | - * | | | - * +---------|------------------------+ | - * | | | | - * +----------v---------v---------+ +-----------v---------v--------+ - * | | | | - * | FullyConnected | | FullyConnected | - * | | | | - * +--------------+---------------+ +--------------+---------------+ - * | | - * | Output | Output - * | | - * +--------------v---------------------------------v---------------+ - * | | - * | Concat | - * | | - * +-------------------------------+--------------------------------+ - * | - * | - * v - */ - -class SplitFC: public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("SplitFC", "0"); - SplitFC(int sub_stream_num); -}; - -} // namespace intel_cpu -} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp index 20502f67d3645e..87fa1291bb7141 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp @@ -2,36 +2,67 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "openvino/core/type/element_type.hpp" #include "openvino/pass/constant_folding.hpp" -#include "openvino/op/fake_quantize.hpp" #include "openvino/pass/manager.hpp" #include "common/pass/align_matmul_input_ranks.hpp" -#include "transformations/common_optimizations/reshape_prelu.hpp" -#include "common/pass/convert_broadcast_to_tiles.hpp" +#include 
"transformations/common_optimizations/nop_elimination.hpp" #include "common/pass/convert_tile_to_seq_tiles.hpp" #include "common/pass/convert_matmul_to_fc.hpp" #include "common/pass/convert_to_power_static.hpp" #include "common/pass/convert_to_leaky_relu.hpp" #include "common/pass/convert_to_swish_cpu.hpp" #include "common/pass/move_fc_reshape_to_weights.hpp" -#include "common/pass/split_fc.hpp" +#include "common/pass/fc_bias_fusion.hpp" #include "transformations/convert_precision.hpp" -#include "transformations/utils/utils.hpp" +#include "transformations/op_conversions/convert_fc_to_compressed.hpp" +#include "transformations/op_conversions/convert_fc_to_quantized_legacy.hpp" #include "common/pass/rnn_sequences_optimization.hpp" #include "transformations/common_optimizations/reshape_sequence_fusion.hpp" #include "transformations/defs.hpp" +#include "config.h" +#include "nodes/fullyconnected.h" #include "itt.hpp" namespace ov { namespace intel_cpu { -inline void ConvertToCPUSpecificOpset(std::shared_ptr &model) { +inline void ConvertToCPUSpecificOpset(std::shared_ptr &model, const Config& config) { RUN_ON_FUNCTION_SCOPE(ConvertToCPUSpecificOpset); ov::pass::Manager manager("CPU:ConvertToCPUSpecificOpset"); manager.set_per_pass_validation(false); + CPU_REGISTER_PASS_COMMON(manager, ConvertMatMulToFC); + CPU_REGISTER_PASS_COMMON(manager, FullyConnectedBiasFusion); + + std::vector supported_activation_types { + // @todo enable for bf16 as well + // after EnforceInferencePrecision is replaced with ConvertPrecision + ov::element::f32, + }; + + std::vector supported_compressed_weights_types { + ov::element::u8, + ov::element::i8, + ov::element::u4, + ov::element::i4, + ov::element::nf4, + ov::element::f4e2m1, + }; + + CPU_REGISTER_PASS_X64( + manager, + pass::ConvertFullyConnectedToFullyConnectedCompressed, + supported_activation_types, + supported_compressed_weights_types, + [&config](const std::shared_ptr& fc, size_t IC, size_t OC, size_t G) { + return ov::intel_cpu::node::FullyConnected::isSupportedCompressedOperation( + fc, IC, OC, G, config.inferencePrecision); + }); + + CPU_REGISTER_PASS_X64(manager, pass::ConvertFCToFCQuantizedLegacy); CPU_REGISTER_PASS_X64(manager, MoveFCReshapeToWeights); CPU_REGISTER_PASS_X64(manager, ov::pass::Validate); CPU_REGISTER_PASS_COMMON(manager, AlignMatMulInputRanks); diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 27afb95a73a1e9..f9fa372030e4cc 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -21,6 +21,7 @@ // Common transformations #include "transformations/common_optimizations/mark_precision_sensitive_shapeof_subgraphs.hpp" #include "transformations/common_optimizations/add_fake_quantize_fusion.hpp" +#include "transformations/common_optimizations/reshape_prelu.hpp" #include "transformations/fp16_compression/convert_compression_only_to_legacy.hpp" #include "transformations/common_optimizations/convert_quantize_dequantize.hpp" #include "transformations/common_optimizations/lstm_cell_fusion.hpp" @@ -319,7 +320,7 @@ void Transformations::UpToLpt() { void Transformations::CpuSpecificOpSet(void) { CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, Specific); - ConvertToCPUSpecificOpset(model); + ConvertToCPUSpecificOpset(model, config); } void Transformations::PreLpt(const std::vector& defaultPrecisions) { diff --git 
a/src/plugins/intel_cpu/src/transformations/utils.cpp b/src/plugins/intel_cpu/src/transformations/utils.cpp index 3aa74f9ed9a970..63871868713e02 100644 --- a/src/plugins/intel_cpu/src/transformations/utils.cpp +++ b/src/plugins/intel_cpu/src/transformations/utils.cpp @@ -4,7 +4,7 @@ #include "utils.hpp" #include "openvino/opsets/opset1.hpp" -#include "cpu_opset/common/op/fully_connected.hpp" +#include "ov_ops/fully_connected.hpp" #include "transformations/rt_info/dequantization_node.hpp" #include "transformations/utils/utils.hpp" @@ -21,7 +21,7 @@ bool has_matmul_with_compressed_weights(const std::shared_ptr& }; for (const auto& op : model->get_ops()) { - if (!ov::is_type(op) && !ov::is_type(op)) + if (!ov::is_type(op) && !ov::is_type(op)) continue; if (!op->get_input_element_type(0).is_real()) diff --git a/src/plugins/intel_cpu/src/utils/cpu_utils.hpp b/src/plugins/intel_cpu/src/utils/cpu_utils.hpp index b6bd36205f985d..8ae9aa67edf9a7 100644 --- a/src/plugins/intel_cpu/src/utils/cpu_utils.hpp +++ b/src/plugins/intel_cpu/src/utils/cpu_utils.hpp @@ -9,6 +9,7 @@ #include #include "general_utils.h" +#include "openvino/core/except.hpp" #include "precision_support.h" namespace ov { @@ -156,5 +157,35 @@ inline std::vector makeAlignedBuffer(size_t targetSize, const std::vector } return alignedBuffer; } + +/** +* @brief Reshape a tensor down to a specific rank +* +* Examples: +* - reshapeToRank<2>({1, 2, 3, 4, 5}) == {1*2*3*4, 5} == {24, 5} +* - reshapeToRank<4>({1, 2, 3, 4, 5}) == {1*2, 3, 4, 5} == {2, 3, 4, 5} +*/ +template +std::vector reshapeDownToRank(const std::vector& dims, size_t rank) { + OPENVINO_ASSERT(rank > 0, "Rank greater than zero is expected"); + + if (dims.size() <= rank) { + return dims; + } + + const auto accEnd = dims.begin() + (dims.size() - rank + 1); + const auto acc = std::accumulate(dims.begin(), accEnd, (T)1, std::multiplies()); + + std::vector result{acc}; + result.insert(result.end(), accEnd, dims.end()); + + return result; +} + +template +std::vector reshapeDownToRank(const std::vector& dims) { + return reshapeDownToRank(dims, rank); +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp b/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp index fcc983d84166c5..195d46c70e1c7c 100644 --- a/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp +++ b/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp @@ -2,6 +2,7 @@ // Copyright (C) 2018-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +#include "openvino/core/type/element_type.hpp" #ifdef CPU_DEBUG_CAPS #include "cpu_memory.h" @@ -310,7 +311,7 @@ std::ostream & operator<<(std::ostream & os, const Node &c_node) { void * data = pmem->getData(); auto shape = pmem->getDesc().getShape().getDims(); - if (shape_size(shape) <= 8) { + if (shape_size(shape) <= 8 && pmem->getDesc().getPrecision() != ov::element::undefined) { auto type = pmem->getDesc().getPrecision(); auto tensor = ov::Tensor(type, shape, data); auto constop = std::make_shared(tensor); @@ -663,7 +664,7 @@ std::ostream& operator<<(std::ostream& os, const IMemory& mem) { } return os; } -// @todo remove + void print_dnnl_memory(const dnnl::memory& memory, const size_t size, const int id, const char* message) { const size_t s = memory.get_desc().get_size() / sizeof(float); std::cout << message << " " << id << " size: " << s << ", values: "; diff --git a/src/plugins/intel_cpu/src/utils/debug_capabilities.h b/src/plugins/intel_cpu/src/utils/debug_capabilities.h index 
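
A short usage sketch for the reshapeDownToRank helper added to cpu_utils.hpp above (its doc comment illustrates the same behaviour under the shorthand reshapeToRank). The free-standing copy below mirrors the helper so the example compiles outside the plugin: all leading dimensions are collapsed into one so the result has exactly the requested rank.

#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

template <typename T>
std::vector<T> reshapeDownToRank(const std::vector<T>& dims, size_t rank) {
    assert(rank > 0 && "Rank greater than zero is expected");
    if (dims.size() <= rank)
        return dims;
    // collapse the leading dims into a single one so the result has exactly `rank` dims
    const auto accEnd = dims.begin() + (dims.size() - rank + 1);
    const auto acc = std::accumulate(dims.begin(), accEnd, static_cast<T>(1), std::multiplies<T>());
    std::vector<T> result{acc};
    result.insert(result.end(), accEnd, dims.end());
    return result;
}

int main() {
    // matches the examples in the helper's doc comment
    assert((reshapeDownToRank<int64_t>({1, 2, 3, 4, 5}, 2) == std::vector<int64_t>{24, 5}));
    assert((reshapeDownToRank<int64_t>({1, 2, 3, 4, 5}, 4) == std::vector<int64_t>{2, 3, 4, 5}));
    // already at or below the requested rank: returned unchanged
    assert((reshapeDownToRank<int64_t>({7, 8}, 3) == std::vector<int64_t>{7, 8}));
    return 0;
}
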
7a1158d259a4a3..2646ba817dca9c 100644 --- a/src/plugins/intel_cpu/src/utils/debug_capabilities.h +++ b/src/plugins/intel_cpu/src/utils/debug_capabilities.h @@ -3,6 +3,7 @@ // #pragma once +#include "cpu_types.h" #include "openvino/util/env_util.hpp" #ifdef CPU_DEBUG_CAPS @@ -94,6 +95,12 @@ class PrintableTimer { } }; +template +std::ostream & operator<<(std::ostream & os, const std::vector vec) { + for (const auto& element : vec) + os << element << "x"; + return os; +} std::ostream & operator<<(std::ostream & os, const PortConfig& desc); std::ostream & operator<<(std::ostream & os, const NodeConfig& desc); std::ostream & operator<<(std::ostream & os, const NodeDesc& desc); diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/matmul.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/matmul.cpp index 6d827614f80c54..4afdd90427b06e 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/matmul.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/matmul.cpp @@ -23,7 +23,6 @@ static const std::vector& filterSpecificParamsFC() { std::vector fusingParamsSet2D_smoke { emptyFusingSpec, fusingBias, - fusingMultiplyPerChannel, fusingRelu, fusingTanh }; @@ -62,7 +61,6 @@ INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_f16, MatMulLayerCPUTest, testParams2D_smoke std::vector fusingParamsSet3D_smoke { emptyFusingSpec, fusingBias, - fusingMultiplyPerChannel, fusingRelu, fusingTanh }; @@ -106,7 +104,6 @@ const std::vector IS = { std::vector fusingParamsSet4D_smoke { emptyFusingSpec, - fusingMultiplyPerChannel, fusingRelu, fusingTanh }; diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp index 3643427de3e9b7..9a434943893eed 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp @@ -87,7 +87,7 @@ class MatmulWeightsDecompression : public testing::WithParamInterface() << ":"; + result << configEntry.first << ", " << configEntry.second.as() << "_"; } result << ")"; result << CpuTestWithFusing::getTestCaseName(fusing_params); diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/custom_shape_infer/fullconnect.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/custom_shape_infer/fullconnect.cpp index a5b01a2c3c2f9c..90a2fc9d0b9768 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/custom_shape_infer/fullconnect.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/custom_shape_infer/fullconnect.cpp @@ -4,9 +4,11 @@ #include -#include "openvino/op/parameter.hpp" -#include "transformations/cpu_opset/common/op/fully_connected.hpp" #include "custom_shape_infer.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/parameter.hpp" +#include "ov_ops/fully_connected.hpp" + namespace ov { namespace intel_cpu { namespace unit_test { @@ -16,16 +18,66 @@ using namespace ov; using namespace ov::intel_cpu; TEST(CpuShapeInfer, FC_InputSize_2) { - auto activate = std::make_shared(element::f32, PartialShape{-1, -1 }); + auto activate = std::make_shared(element::f32, PartialShape{-1, -1}); auto weight = std::make_shared(element::f32, PartialShape{5, 6}); - auto op = std::make_shared(activate, weight, ov::Rank(5), 
element::f32); + auto op = std::make_shared( + activate, + weight, + std::make_shared(ov::element::undefined, ov::Shape{0})); std::vector static_input_shapes = {StaticShape{720, 640}, {5, 6}}; - std::vector static_output_shapes = {StaticShape{1, 1, 1, 720, 5}}; + std::vector static_output_shapes = {StaticShape{720, 5}}; + unit_test::cpu_test_shape_infer(op.get(), static_input_shapes, static_output_shapes); +} + +TEST(CpuShapeInfer, FC_broadcastWeights1) { + auto activate = std::make_shared(element::f32, PartialShape{1, -1, -1}); + auto weight = std::make_shared(element::f32, PartialShape{5, 6}); + auto op = std::make_shared( + activate, + weight, + std::make_shared(ov::element::undefined, ov::Shape{0})); + std::vector static_input_shapes = {StaticShape{1, 720, 6}, {5, 6}}; + std::vector static_output_shapes = {StaticShape{1, 720, 5}}; + unit_test::cpu_test_shape_infer(op.get(), static_input_shapes, static_output_shapes); +} + +TEST(CpuShapeInfer, FC_broadcastWeights2) { + auto activate = std::make_shared(element::f32, PartialShape{-1, -1, -1, -1}); + auto weight = std::make_shared(element::f32, PartialShape{5, 6}); + auto op = std::make_shared( + activate, + weight, + std::make_shared(ov::element::undefined, ov::Shape{0})); + std::vector static_input_shapes = {StaticShape{2, 3, 720, 6}, {5, 6}}; + std::vector static_output_shapes = {StaticShape{2, 3, 720, 5}}; + unit_test::cpu_test_shape_infer(op.get(), static_input_shapes, static_output_shapes); +} + +TEST(CpuShapeInfer, FC_broadcastActivations1) { + auto activate = std::make_shared(element::f32, PartialShape{720, -1}); + auto weight = std::make_shared(element::f32, PartialShape{1, 5, 6}); + auto op = std::make_shared( + activate, + weight, + std::make_shared(ov::element::undefined, ov::Shape{0})); + std::vector static_input_shapes = {StaticShape{720, 6}, {1, 5, 6}}; + std::vector static_output_shapes = {StaticShape{1, 720, 5}}; unit_test::cpu_test_shape_infer(op.get(), static_input_shapes, static_output_shapes); } -} // namespace cpu_shape_infer -} // namespace unit_test -} // namespace intel_cpu -} // namespace ov +TEST(CpuShapeInfer, FC_broadcastActivations2) { + auto activate = std::make_shared(element::f32, PartialShape{-1, -1}); + auto weight = std::make_shared(element::f32, PartialShape{1, 1, 5, 6}); + auto op = std::make_shared( + activate, + weight, + std::make_shared(ov::element::undefined, ov::Shape{0})); + std::vector static_input_shapes = {StaticShape{720, 6}, {1, 1, 5, 6}}; + std::vector static_output_shapes = {StaticShape{1, 1, 720, 5}}; + unit_test::cpu_test_shape_infer(op.get(), static_input_shapes, static_output_shapes); +} +} // namespace cpu_shape_infer +} // namespace unit_test +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/unit/transformations/convert_matmul_test.cpp b/src/plugins/intel_cpu/tests/unit/transformations/convert_matmul_test.cpp index cb085920d97dc5..37df1fd6d27910 100644 --- a/src/plugins/intel_cpu/tests/unit/transformations/convert_matmul_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/transformations/convert_matmul_test.cpp @@ -4,21 +4,20 @@ #include -#include #include - #include #include #include #include -#include +#include +#include #include #include #include -#include -#include #include "common_test_utils/ov_test_utils.hpp" +#include "openvino/op/constant.hpp" +#include "ov_ops/fully_connected.hpp" #include "transformations/rt_info/decompression.hpp" using namespace testing; @@ -26,25 +25,28 @@ using namespace ov::intel_cpu; TEST_F(TransformationTestsF, 
ConvertMatMulToFCTest1) { { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 3, 2, 2 }); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 1, 2, 2 }, { 1 }); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 2, 2}, {1}); auto matmul = std::make_shared(input1, input2, true, false); - model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); manager.register_pass(); } { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 3, 2, 2 }); - auto transpose_constant1 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, { 0, 2, 1 }); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); + auto transpose_constant1 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{3}, {0, 2, 1}); auto transpose1 = std::make_shared(input1, transpose_constant1); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 2, 2 }, { 1 }); - auto transpose_constant2 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 }); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 2, 2}, {1}); + auto transpose_constant2 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{3}, {0, 2, 1}); auto transpose2 = std::make_shared(input2, transpose_constant2); - auto matmul = std::make_shared(transpose1, transpose2, ov::Rank(3)); + auto matmul = std::make_shared( + transpose1, + transpose2, + std::make_shared(ov::element::undefined, ov::Shape{0})); - model_ref = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } } @@ -78,7 +80,10 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest3) { { auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2, 2}, {1}); - auto matmul = std::make_shared(input1, input2, ov::Rank(3)); + auto matmul = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0})); model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } @@ -96,27 +101,30 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest4) { { auto input1 = std::make_shared(ov::element::f32, ov::PartialShape{-1, -1, 2}); auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2, 2}, {1}); - auto matmul = std::make_shared(input1, input2, ov::Rank(3)); + auto matmul = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0})); model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } } TEST_F(TransformationTestsF, ConvertMatMulToFCTest5) { - auto input1 = std::make_shared(ov::element::f32, ov::PartialShape{ -1, -1, 2 }); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 3, 2, 2 }, { 1 }); + auto input1 = std::make_shared(ov::element::f32, ov::PartialShape{-1, -1, 2}); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{3, 2, 2}, {1}); auto matmul = std::make_shared(input1, input2, false, true); - model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); manager.register_pass(); } TEST_F(TransformationTestsF, ConvertMatMulToFCTest6) { 
- auto input1 = std::make_shared(ov::element::f32, ov::PartialShape{ -1, -1, 2 }); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 3, 1, 2 }, { 1 }); + auto input1 = std::make_shared(ov::element::f32, ov::PartialShape{-1, -1, 2}); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{3, 1, 2}, {1}); auto matmul = std::make_shared(input1, input2, false, true); - model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); manager.register_pass(); } @@ -132,7 +140,10 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest7) { { auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{3, 2}, {1}); - auto fc = std::make_shared(input1, input2, ov::Rank(2)); + auto fc = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0})); model_ref = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{input1}); } @@ -151,11 +162,14 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest8) { auto input1 = std::make_shared(ov::element::f32, ov::PartialShape{-1, -1, 2}); auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{3, 2}, {1}); - auto fc = std::make_shared(input1, input2, ov::Rank(2)); + auto fc = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0})); auto a_shape = std::make_shared(input1); auto I = ov::op::util::node_to_get_shape_value_of_indices_from_shape_node(a_shape, {0, 1}); - auto O = ov::opset1::Constant::create(ov::element::i64, { 1 }, { 3 }); + auto O = ov::opset1::Constant::create(ov::element::i64, {1}, {3}); auto output_shape = std::make_shared(ov::OutputVector{I, O}, 0); model_ref = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{input1}); @@ -174,7 +188,10 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest9) { { auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2, 2}, {1}); - auto matmul = std::make_shared(input1, input2, ov::Rank(3)); + auto matmul = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0})); model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } @@ -182,10 +199,10 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest9) { TEST_F(TransformationTestsF, ConvertMatMulToFCTest10) { auto input1 = std::make_shared(ov::element::f32, ov::PartialShape::dynamic()); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 2, 2 }, { 1 }); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2, 2}, {1}); auto matmul = std::make_shared(input1, input2, false, true); - model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); manager.register_pass(); } @@ -218,8 +235,11 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest13) { } { auto input1 = std::make_shared(ov::element::f32, ov::PartialShape{-1, -1, 1}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{80, 1}, {1}); - auto matmul = std::make_shared(input1, input2, ov::Rank(3)); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 80, 1}, {1}); + auto matmul = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, 
ov::Shape{0})); model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } @@ -242,8 +262,13 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest14) { } { auto input1 = std::make_shared(ov::element::u8, ov::PartialShape{-1, -1, 1}); - auto input2 = ov::opset1::Constant::create(ov::element::i8, ov::Shape{80, 1}, {1}); - auto matmul = std::make_shared(input1, input2, ov::Rank(3), ov::element::f32); + auto input2 = ov::opset1::Constant::create(ov::element::i8, ov::Shape{1, 80, 1}, {1}); + + auto matmul = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0}), + ov::element::f32); model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } @@ -252,7 +277,7 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest14) { TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_1) { { auto input1 = std::make_shared(ov::element::f32, ov::Shape{2, 3, 4, 5}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 6, 5 }, { 1 }); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{6, 5}, {1}); auto matmul = std::make_shared(input1, input2, false, true); model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); @@ -260,8 +285,13 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_1) { } { auto input1 = std::make_shared(ov::element::f32, ov::Shape{2, 3, 4, 5}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 6, 5 }, { 1 }); - auto fc = std::make_shared(input1, input2, ov::Rank(4), ov::element::f32); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{6, 5}, {1}); + + auto fc = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0}), + ov::element::f32); model_ref = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{input1}); } @@ -278,8 +308,11 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_2) { } { auto input1 = std::make_shared(ov::element::f32, ov::PartialShape{-1, -1, 1, 5}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{10, 5}, {1}); - auto fc = std::make_shared(input1, input2, ov::Rank(4)); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 10, 5}, {1}); + auto fc = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0})); model_ref = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{input1}); } @@ -288,7 +321,7 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_2) { TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_3) { { auto input1 = std::make_shared(ov::element::f32, ov::Shape{2, 4}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, { 1 }); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, {1}); auto matmul = std::make_shared(input1, input2, false, true); model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); @@ -296,8 +329,12 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_3) { } { auto input1 = std::make_shared(ov::element::f32, ov::Shape{2, 4}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{5, 4}, { 1 }); - auto fc = std::make_shared(input1, input2, ov::Rank(4), ov::element::f32); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, {1}); + auto fc = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0}), + ov::element::f32); 
model_ref = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{input1}); } @@ -306,7 +343,7 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_3) { TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_4) { { auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 4}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, { 1 }); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, {1}); auto matmul = std::make_shared(input1, input2, false, true); model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); @@ -314,8 +351,12 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_4) { } { auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 4}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{5, 4}, { 1 }); - auto fc = std::make_shared(input1, input2, ov::Rank(4), ov::element::f32); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, {1}); + auto fc = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0}), + ov::element::f32); model_ref = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{input1}); } @@ -324,7 +365,7 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_4) { TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_5) { { auto input1 = std::make_shared(ov::element::f32, ov::Shape{2, 3, 2, 4}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, { 1 }); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, {1}); auto matmul = std::make_shared(input1, input2, false, true); model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); @@ -332,8 +373,12 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_5) { } { auto input1 = std::make_shared(ov::element::f32, ov::Shape{2, 3, 2, 4}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{5, 4}, { 1 }); - auto fc = std::make_shared(input1, input2, ov::Rank(4), ov::element::f32); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, {1}); + auto fc = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0}), + ov::element::f32); model_ref = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{input1}); } @@ -350,97 +395,112 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_second_input_rank_adj_1) { } { auto input1 = std::make_shared(ov::element::f32, ov::Shape{5, 2, 3}); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2, 3}, {1}); - auto matmul = std::make_shared(input1, input2, ov::Rank(2)); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 2, 3}, {1}); + auto matmul = std::make_shared( + input1, + input2, + std::make_shared(ov::element::undefined, ov::Shape{0})); model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } } TEST_F(TransformationTestsF, ConvertMatMulToFCTest_second_input_rank_adj_2) { { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 2, 3 }); - auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 2, 3 }, { 1 }); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{2, 3}); + auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2, 3}, {1}); auto matmul = std::make_shared(input1, weights, false, true); - model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 
}); + model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); manager.register_pass(); } { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 2, 3 }); - auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 2, 3 }, { 1 }); - auto matmul = std::make_shared(input1, weights, ov::Rank(2)); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{2, 3}); + auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2, 3}, {1}); + auto matmul = std::make_shared( + input1, + weights, + std::make_shared(ov::element::undefined, ov::Shape{0})); - model_ref = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } } TEST_F(TransformationTestsF, ConvertMatMulToFCTest_second_input_rank_adj_3) { { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 5, 2, 3 }); - auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 1, 2, 3 }, { 1 }); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{5, 2, 3}); + auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 2, 3}, {1}); auto matmul = std::make_shared(input1, weights, false, true); - model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); manager.register_pass(); } { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 5, 2, 3 }); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{5, 2, 3}); - auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 2, 3 }, { 1 }); - auto matmul = std::make_shared(input1, weights, ov::Rank(3)); - model_ref = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 2, 3}, {1}); + auto matmul = std::make_shared( + input1, + weights, + std::make_shared(ov::element::undefined, ov::Shape{0})); + model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } } TEST_F(TransformationTestsF, ConvertMatMulToFCTest_decompress_convert_0) { { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 3, 2, 2 }); - auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{ 1, 2, 2 }, { 1 }); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); + auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{1, 2, 2}, {1}); auto convert = std::make_shared(input2, ov::element::f32); ov::mark_as_decompression(convert); auto matmul = std::make_shared(input1, convert, false, false); - model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); manager.register_pass(); } { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 3, 2, 2 }); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); - auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{ 2, 2 }, { 1 }); - auto transpose_constant = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 }); + auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{1, 2, 2}, {1}); + auto transpose_constant = ov::opset1::Constant::create(ov::element::i32, ov::Shape{3}, {0, 2, 1}); auto transpose = std::make_shared(input2, transpose_constant); auto convert = std::make_shared(transpose, ov::element::f32); - 
auto matmul = std::make_shared(input1, convert, ov::Rank(3)); + auto matmul = std::make_shared( + input1, + convert, + std::make_shared(ov::element::undefined, ov::Shape{0})); - model_ref = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } } TEST_F(TransformationTestsF, ConvertMatMulToFCTest_decompress_convert_1) { { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 3, 2, 2 }); - auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{ 1, 2, 2 }, { 1 }); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); + auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{1, 2, 2}, {1}); auto convert = std::make_shared(input2, ov::element::f32); ov::mark_as_decompression(convert); auto matmul = std::make_shared(input1, convert, true, false); - model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); manager.register_pass(); } { - auto input1 = std::make_shared(ov::element::f32, ov::Shape{ 3, 2, 2 }); - auto transpose_constant1 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, { 0, 2, 1 }); + auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); + auto transpose_constant1 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{3}, {0, 2, 1}); auto transpose1 = std::make_shared(input1, transpose_constant1); - auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{ 2, 2 }, { 1 }); - auto transpose_constant2 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 }); + auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{1, 2, 2}, {1}); + auto transpose_constant2 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{3}, {0, 2, 1}); auto transpose2 = std::make_shared(input2, transpose_constant2); auto convert = std::make_shared(transpose2, ov::element::f32); - auto matmul = std::make_shared(transpose1, convert, ov::Rank(3)); + auto matmul = std::make_shared( + transpose1, + convert, + std::make_shared(ov::element::undefined, ov::Shape{0})); - model_ref = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); + model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } } @@ -467,12 +527,13 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_compressed_u8_weights) { auto mul_const = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 2}, {1}); auto mul = std::make_shared(sub, mul_const); - auto reshape_const = ov::opset1::Constant::create(ov::element::i32, {2}, {2, -1}); - auto reshape = std::make_shared(mul, reshape_const, false); - auto transpose_const = ov::opset1::Constant::create(ov::element::i32, {2}, {1, 0}); - auto transpose = std::make_shared(reshape, transpose_const); - auto matmul = std::make_shared(data, transpose, ov::Rank(3)); + auto transpose_const = ov::opset1::Constant::create(ov::element::i32, {3}, {0, 2, 1}); + auto transpose = std::make_shared(mul, transpose_const); + auto matmul = std::make_shared( + data, + transpose, + std::make_shared(ov::element::undefined, ov::Shape{0})); - model_ref = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ data }); + model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{data}); } } diff --git a/src/plugins/intel_cpu/tests/unit/transformations/move_fc_reshape_to_weights.cpp 
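
The updated tests above repeatedly build the three-input internal FullyConnected where "no bias" is modelled as an empty Constant of element::undefined with Shape{0}. A minimal sketch of that construction pattern follows; the template arguments (Parameter, Constant, ov::op::internal::FullyConnected) are inferred from the includes in these hunks, since the diff text dropped them.

#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/parameter.hpp"
#include "ov_ops/fully_connected.hpp"

std::shared_ptr<ov::Model> make_fc_model() {
    auto data = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{-1, -1, 2});
    auto weights = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{3, 2}, {1});
    // empty bias placeholder; FullyConnectedBiasFusion later replaces it when an Add follows the FC
    auto bias = std::make_shared<ov::op::v0::Constant>(ov::element::undefined, ov::Shape{0});
    auto fc = std::make_shared<ov::op::internal::FullyConnected>(data, weights, bias);
    return std::make_shared<ov::Model>(ov::NodeVector{fc}, ov::ParameterVector{data});
}
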
b/src/plugins/intel_cpu/tests/unit/transformations/move_fc_reshape_to_weights.cpp index 68241c9169bce7..b3d733aecba27b 100644 --- a/src/plugins/intel_cpu/tests/unit/transformations/move_fc_reshape_to_weights.cpp +++ b/src/plugins/intel_cpu/tests/unit/transformations/move_fc_reshape_to_weights.cpp @@ -11,7 +11,7 @@ #include #include -#include +#include "ov_ops/fully_connected.hpp" #include #include @@ -115,7 +115,12 @@ class MoveFCReshapeToWeightsTests : public TransformationTestsF, public WithPara auto transpose_const = ov::opset1::Constant::create(ov::element::i32, {2}, {1, 0}); weights_path = std::make_shared(weights_path, transpose_const); } - auto fully_connected = std::make_shared(data, weights_path, ov::Rank(3)); + + auto fully_connected = std::make_shared( + data, + weights_path, + std::make_shared(ov::element::undefined, ov::Shape{0})); + return std::make_shared(ov::NodeVector{fully_connected}, ov::ParameterVector{data}); } diff --git a/src/plugins/intel_cpu/tests/unit/transformations/split_fc_test.cpp b/src/plugins/intel_cpu/tests/unit/transformations/split_fc_test.cpp deleted file mode 100644 index 4c955ec5286813..00000000000000 --- a/src/plugins/intel_cpu/tests/unit/transformations/split_fc_test.cpp +++ /dev/null @@ -1,280 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include "openvino/core/visibility.hpp" -#include -#include - -#include "common_test_utils/ov_test_utils.hpp" -#include "transformations/rt_info/decompression.hpp" - -using namespace testing; -using namespace ov::intel_cpu; - -#if defined (OPENVINO_ARCH_ARM) && defined(__linux__) -// Ticket: 153166 -TEST_F(TransformationTestsF, DISABLED_SplitFCTest) { -#else -TEST_F(TransformationTestsF, SplitFCTest) { -#endif - disable_rt_info_check(); - { - auto src = std::make_shared(ov::element::f32, ov::Shape{ 3, 4096, 1 }); - auto transpose_constant_src = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, { 0, 2, 1 }); - auto transpose_src = std::make_shared(src, transpose_constant_src); - - auto wgt = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 2048, 4096 }, { 12.34 }); - - auto fc = std::make_shared(transpose_src, wgt, ov::Rank(3)); - model = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{src}); - manager.register_pass(1); - } - { - auto src = std::make_shared(ov::element::f32, ov::Shape{ 3, 4096, 1 }); - auto transpose_constant_src = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, { 0, 2, 1 }); - auto transpose_src = std::make_shared(src, transpose_constant_src); - - auto wgt = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 2048, 4096 }, { 12.34 }); - - auto split_dim_node = std::make_shared(ov::element::i32, ov::Shape{}, 0); - auto split_length = ov::opset1::Constant::create(ov::element::i32, ov::Shape{2}, {1024, 1024}); - auto split_wgts = std::make_shared(wgt, split_dim_node, split_length); - - auto fc0 = std::make_shared(transpose_src, split_wgts->output(0), ov::Rank(3)); - auto fc1 = std::make_shared(transpose_src, split_wgts->output(1), ov::Rank(3)); - - ov::NodeVector concat_args({fc0, fc1}); - constexpr size_t concat_dim = -1; - auto concat = std::make_shared(concat_args, concat_dim); - model_ref = std::make_shared(ov::NodeVector{concat}, ov::ParameterVector{src}); - } -} - -#if defined (OPENVINO_ARCH_ARM) && defined(__linux__) -// Ticket: 153166 -TEST_F(TransformationTestsF, 
DISABLED_SplitFCTest_int8_weight) { -#else -TEST_F(TransformationTestsF, SplitFCTest_int8_weight) { -#endif - disable_rt_info_check(); - { - auto src = std::make_shared(ov::element::f32, ov::Shape{3, 4096, 1}); - auto transpose_constant_src = ov::opset1::Constant::create(ov::element::i32, ov::Shape{3}, {0, 2, 1}); - auto transpose_src = std::make_shared(src, transpose_constant_src); - - auto wgt = ov::opset1::Constant::create(ov::element::u8, ov::Shape{2048, 4096}, {123}); - auto cvt_wgt = std::make_shared(wgt, ov::element::f32); - - auto zp = ov::opset1::Constant::create(ov::element::u8, ov::Shape{2048, 1}, {1}); - auto cvt_zp = std::make_shared(zp, ov::element::f32); - - auto sub = std::make_shared(cvt_wgt, cvt_zp); - - auto mul_const = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2048, 1}, {0.2}); - auto mul = std::make_shared(sub, mul_const); - - auto fc = std::make_shared(transpose_src, mul, ov::Rank(3)); - model = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{src}); - manager.register_pass(1); - } - { - auto src = std::make_shared(ov::element::f32, ov::Shape{ 3, 4096, 1 }); - auto transpose_constant_src = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, { 0, 2, 1 }); - auto transpose_src = std::make_shared(src, transpose_constant_src); - - auto wgt = ov::opset1::Constant::create(ov::element::u8, ov::Shape{ 2048, 4096 }, { 123 }); - auto cvt_wgt = std::make_shared(wgt, ov::element::f32); - - auto split_dim_node = std::make_shared(ov::element::i32, ov::Shape{}, 0); - auto split_length = ov::opset1::Constant::create(ov::element::i32, ov::Shape{2}, {1024, 1024}); - - auto split_wgts = std::make_shared(wgt, split_dim_node, split_length); - auto cvt_wgt0 = std::make_shared(split_wgts->output(0), ov::element::f32); - auto cvt_wgt1 = std::make_shared(split_wgts->output(1), ov::element::f32); - - auto zp = ov::opset1::Constant::create(ov::element::u8, ov::Shape{2048, 1}, {1}); - auto split_zp = std::make_shared(zp, split_dim_node, split_length); - - auto cvt_zp0 = std::make_shared(split_zp->output(0), ov::element::f32); - auto cvt_zp1 = std::make_shared(split_zp->output(1), ov::element::f32); - - auto sub0 = std::make_shared(cvt_wgt0, cvt_zp0); - auto sub1 = std::make_shared(cvt_wgt1, cvt_zp1); - - auto mul_const = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2048, 1}, {0.2}); - auto split_mul_const = std::make_shared(mul_const, split_dim_node, split_length); - - auto mul0 = std::make_shared(sub0, split_mul_const->output(0)); - auto mul1 = std::make_shared(sub1, split_mul_const->output(1)); - - auto fc0 = std::make_shared(transpose_src, mul0, ov::Rank(3)); - auto fc1 = std::make_shared(transpose_src, mul1, ov::Rank(3)); - - ov::NodeVector concat_args({fc0, fc1}); - constexpr size_t concat_dim = -1; - auto concat = std::make_shared(concat_args, concat_dim); - model_ref = std::make_shared(ov::NodeVector{concat}, ov::ParameterVector{src}); - } -} - -#if defined (OPENVINO_ARCH_ARM) && defined(__linux__) -// Ticket: 153166 -TEST_F(TransformationTestsF, DISABLED_SplitFCTest_int4_weight) { -#else -TEST_F(TransformationTestsF, SplitFCTest_int4_weight) { -#endif - disable_rt_info_check(); - { - auto src = std::make_shared(ov::element::f32, ov::Shape{3, 4096, 1}); - auto transpose_constant_src = ov::opset1::Constant::create(ov::element::i32, ov::Shape{3}, {0, 2, 1}); - auto transpose_src = std::make_shared(src, transpose_constant_src); - - auto wgt = ov::opset1::Constant::create(ov::element::u4, ov::Shape{2048, 4096}, {12}); - auto cvt_wgt = 
std::make_shared(wgt, ov::element::f32); - - auto zp = ov::opset1::Constant::create(ov::element::u4, ov::Shape{2048, 1}, {1}); - auto cvt_zp = std::make_shared(zp, ov::element::f32); - - auto sub = std::make_shared(cvt_wgt, cvt_zp); - - auto mul_const = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2048, 1}, {0.2}); - auto mul = std::make_shared(sub, mul_const); - - auto fc = std::make_shared(transpose_src, mul, ov::Rank(3)); - model = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{src}); - manager.register_pass(1); - } - { - auto src = std::make_shared(ov::element::f32, ov::Shape{3, 4096, 1}); - auto transpose_constant_src = ov::opset1::Constant::create(ov::element::i32, ov::Shape{3}, {0, 2, 1}); - auto transpose_src = std::make_shared(src, transpose_constant_src); - - auto wgt = ov::opset1::Constant::create(ov::element::u4, ov::Shape{2048, 4096}, {12}); - auto cvt_wgt_i8 = std::make_shared(wgt, ov::element::i8); - - auto split_dim_node = std::make_shared(ov::element::i32, ov::Shape{}, 0); - auto split_length = ov::opset1::Constant::create(ov::element::i32, ov::Shape{2}, {1024, 1024}); - - auto split_wgts = std::make_shared(cvt_wgt_i8, split_dim_node, split_length); - auto cvt_wgt0_u4 = std::make_shared(split_wgts->output(0), ov::element::u4); - auto cvt_wgt1_u4 = std::make_shared(split_wgts->output(1), ov::element::u4); - auto cvt_wgt0_f32 = std::make_shared(cvt_wgt0_u4, ov::element::f32); - auto cvt_wgt1_f32 = std::make_shared(cvt_wgt1_u4, ov::element::f32); - - auto zp = ov::opset1::Constant::create(ov::element::u4, ov::Shape{2048, 1}, {1}); - auto cvt_zp_i8 = std::make_shared(zp, ov::element::i8); - auto split_zp = std::make_shared(cvt_zp_i8, split_dim_node, split_length); - - auto cvt_zp0_u4 = std::make_shared(split_zp->output(0), ov::element::u4); - auto cvt_zp1_u4 = std::make_shared(split_zp->output(1), ov::element::u4); - auto cvt_zp0_f32 = std::make_shared(cvt_zp0_u4, ov::element::f32); - auto cvt_zp1_f32 = std::make_shared(cvt_zp1_u4, ov::element::f32); - - auto sub0 = std::make_shared(cvt_wgt0_f32, cvt_zp0_f32); - auto sub1 = std::make_shared(cvt_wgt1_f32, cvt_zp1_f32); - - auto mul_const = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2048, 1}, {0.2}); - auto split_mul_const = std::make_shared(mul_const, split_dim_node, split_length); - - auto mul0 = std::make_shared(sub0, split_mul_const->output(0)); - auto mul1 = std::make_shared(sub1, split_mul_const->output(1)); - - auto fc0 = std::make_shared(transpose_src, mul0, ov::Rank(3)); - auto fc1 = std::make_shared(transpose_src, mul1, ov::Rank(3)); - - ov::NodeVector concat_args({fc0, fc1}); - constexpr size_t concat_dim = -1; - auto concat = std::make_shared(concat_args, concat_dim); - model_ref = std::make_shared(ov::NodeVector{concat}, ov::ParameterVector{src}); - } -} - -#if (defined OPENVINO_ARCH_ARM && defined(__linux__)) -// Ticket: 153166 -TEST_F(TransformationTestsF, DISABLED_SplitFCTest_int4_weight_reshape) { -#else -TEST_F(TransformationTestsF, SplitFCTest_int4_weight_reshape) { -#endif - disable_rt_info_check(); - { - auto src = std::make_shared(ov::element::f32, ov::Shape{ 3, 2048, 1 }); - auto transpose_constant_src = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, { 0, 2, 1 }); - auto transpose_src = std::make_shared(src, transpose_constant_src); - - auto wgt = ov::opset1::Constant::create(ov::element::u4, ov::Shape{ 4096, 2, 1024}, { 12 }); - auto cvt_wgt = std::make_shared(wgt, ov::element::f32); - - auto zp = ov::opset1::Constant::create(ov::element::u4, 
ov::Shape{1}, { 1 }); - auto cvt_zp = std::make_shared(zp, ov::element::f32); - - auto sub = std::make_shared(cvt_wgt, cvt_zp); - - auto mul_const = ov::opset1::Constant::create(ov::element::f32, ov::Shape{4096, 2, 1}, {0.2}); - auto mul = std::make_shared(sub, mul_const); - - auto res_const = ov::opset1::Constant::create(ov::element::i32, ov::Shape{2}, {4096, 2048}); - auto reshape = std::make_shared(mul, res_const, false); - - auto fc = std::make_shared(transpose_src, reshape, ov::Rank(3)); - model = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{src}); - manager.register_pass(1); - } - { - auto src = std::make_shared(ov::element::f32, ov::Shape{ 3, 2048, 1 }); - auto transpose_constant_src = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, { 0, 2, 1 }); - auto transpose_src = std::make_shared(src, transpose_constant_src); - - auto wgt = ov::opset1::Constant::create(ov::element::u4, ov::Shape{ 4096, 2, 1024 }, { 12 }); - auto cvt_wgt_i8 = std::make_shared(wgt, ov::element::i8); - - auto split_dim_node = std::make_shared(ov::element::i32, ov::Shape{}, 0); - auto split_length = ov::opset1::Constant::create(ov::element::i32, ov::Shape{2}, {2048, 2048}); - - auto split_wgts = std::make_shared(cvt_wgt_i8, split_dim_node, split_length); - auto cvt_wgt0_u4 = std::make_shared(split_wgts->output(0), ov::element::u4); - auto cvt_wgt1_u4 = std::make_shared(split_wgts->output(1), ov::element::u4); - auto cvt_wgt0_f32 = std::make_shared(cvt_wgt0_u4, ov::element::f32); - auto cvt_wgt1_f32 = std::make_shared(cvt_wgt1_u4, ov::element::f32); - - auto zp = ov::opset1::Constant::create(ov::element::u4, ov::Shape{1}, { 1 }); - auto zp0 = std::make_shared(zp->get_element_type(), zp->get_shape(), zp->get_data_ptr()); - auto zp1 = std::make_shared(zp->get_element_type(), zp->get_shape(), zp->get_data_ptr()); - - auto cvt_zp0 = std::make_shared(zp0, ov::element::f32); - auto cvt_zp1 = std::make_shared(zp1, ov::element::f32); - - auto sub0 = std::make_shared(cvt_wgt0_f32, cvt_zp0); - auto sub1 = std::make_shared(cvt_wgt1_f32, cvt_zp1); - - auto mul_const = ov::opset1::Constant::create(ov::element::f32, ov::Shape{4096, 2, 1}, {0.2}); - auto split_mul_const = std::make_shared(mul_const, split_dim_node, split_length); - - auto mul0 = std::make_shared(sub0, split_mul_const->output(0)); - auto mul1 = std::make_shared(sub1, split_mul_const->output(1)); - - std::vector reshape_pattern_vec = {2048, 2048}; - auto reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{2}, reshape_pattern_vec); - auto reshape0 = std::make_shared(mul0, reshape_pattern, false); - auto reshape1 = std::make_shared(mul1, reshape_pattern, false); - - auto fc0 = std::make_shared(transpose_src, reshape0, ov::Rank(3)); - auto fc1 = std::make_shared(transpose_src, reshape1, ov::Rank(3)); - - ov::NodeVector concat_args({fc0, fc1}); - constexpr size_t concat_dim = -1; - auto concat = std::make_shared(concat_args, concat_dim); - model_ref = std::make_shared(ov::NodeVector{concat}, ov::ParameterVector{src}); - } -} diff --git a/src/tests/test_utils/common_test_utils/src/ov_test_utils.cpp b/src/tests/test_utils/common_test_utils/src/ov_test_utils.cpp index 8ca920d421040f..d781d92b57052a 100644 --- a/src/tests/test_utils/common_test_utils/src/ov_test_utils.cpp +++ b/src/tests/test_utils/common_test_utils/src/ov_test_utils.cpp @@ -88,6 +88,7 @@ void TransformationTestsF::TearDown() { ASSERT_TRUE(res.valid) << res.message; comparator.disable(FunctionsComparator::CmpValues::ACCURACY); } + auto res = comparator.compare(model, 
model_ref); ASSERT_TRUE(res.valid) << res.message; } From 0762993323c509eeffd2cae48492607dac936903 Mon Sep 17 00:00:00 2001 From: Andrzej Kopytko Date: Tue, 10 Dec 2024 08:41:52 +0100 Subject: [PATCH 22/23] Docs Port for sitemap update to master (#27977) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- .../openvino_custom_sphinx_sitemap/__init__.py | 2 +- docs/sphinx_setup/_static/js/custom.js | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/openvino_custom_sphinx_sitemap/openvino_custom_sphinx_sitemap/__init__.py b/docs/openvino_custom_sphinx_sitemap/openvino_custom_sphinx_sitemap/__init__.py index 6bdd3288f8069c..c578b82c360a53 100644 --- a/docs/openvino_custom_sphinx_sitemap/openvino_custom_sphinx_sitemap/__init__.py +++ b/docs/openvino_custom_sphinx_sitemap/openvino_custom_sphinx_sitemap/__init__.py @@ -155,6 +155,6 @@ def extract_hierarchy(link): return ';'.join(hierarchy) def format_segment(segment): - if segment == 'c_cpp_api': segment = 'c_c++_api' + if segment == 'c_cpp_api': segment = 'C/C++_api' return ' '.join(word.capitalize() for word in segment.replace('-', ' ').replace('_', ' ').split()) \ No newline at end of file diff --git a/docs/sphinx_setup/_static/js/custom.js b/docs/sphinx_setup/_static/js/custom.js index 241f8895ee1c61..95f9549959e102 100644 --- a/docs/sphinx_setup/_static/js/custom.js +++ b/docs/sphinx_setup/_static/js/custom.js @@ -189,7 +189,7 @@ function getCurrentVersion() { if (wordAfterDomain === 'cn') { wordAfterDomain = link[2]; } - if (["index.html", "404.html", "", "latest"].indexOf(wordAfterDomain) >= 0) { + if (["index.html", "404.html", ""].indexOf(wordAfterDomain) >= 0) { /* * If this landing page, 404 or domain.com we should get first version * */ @@ -426,7 +426,7 @@ document.addEventListener('DOMContentLoaded', function () { const searchInterfaceSa = document.querySelector("#sa-search"); const searchInterface = document.querySelector("#search"); const currentVersion = getCurrentVersion(); - + await initializeSearchInterface(searchInterfaceSa, currentVersion); await initializeSearchInterface(searchInterface); From be0ab30ac93be815a34ee20a92348b3220bbf5e1 Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Tue, 10 Dec 2024 11:56:06 +0400 Subject: [PATCH 23/23] [JAX FE] Support square operation (#27978) **Details:** It appears since JAX 0.4.36 **Ticket:** 158994 Signed-off-by: Kazantsev, Roman --- src/frontends/jax/src/op/square.cpp | 28 ++++++++++++++ src/frontends/jax/src/op_table.cpp | 2 + tests/constraints.txt | 6 +-- tests/layer_tests/jax_tests/test_square.py | 44 ++++++++++++++++++++++ 4 files changed, 77 insertions(+), 3 deletions(-) create mode 100644 src/frontends/jax/src/op/square.cpp create mode 100644 tests/layer_tests/jax_tests/test_square.py diff --git a/src/frontends/jax/src/op/square.cpp b/src/frontends/jax/src/op/square.cpp new file mode 100644 index 00000000000000..268debb7992ba8 --- /dev/null +++ b/src/frontends/jax/src/op/square.cpp @@ -0,0 +1,28 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/frontend/jax/node_context.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/power.hpp" +#include "openvino/op/squeeze.hpp" +#include "utils.hpp" + +namespace ov { +namespace frontend { +namespace jax { +namespace op { + +using namespace ov::op; + +OutputVector translate_square(const NodeContext& context) { + num_inputs_check(context, 1, 1); + auto x = context.get_input(0); + auto const_two = 
create_same_type_const_scalar(x, 2); + return {std::make_shared(x, const_two)}; +}; + +} // namespace op +} // namespace jax +} // namespace frontend +} // namespace ov diff --git a/src/frontends/jax/src/op_table.cpp b/src/frontends/jax/src/op_table.cpp index 98f22452c5afab..3ca58745bc1909 100644 --- a/src/frontends/jax/src/op_table.cpp +++ b/src/frontends/jax/src/op_table.cpp @@ -53,6 +53,7 @@ OP_CONVERTER(translate_reduce_window_sum); OP_CONVERTER(translate_reshape); OP_CONVERTER(translate_rsqrt); OP_CONVERTER(translate_slice); +OP_CONVERTER(translate_square); OP_CONVERTER(translate_squeeze); OP_CONVERTER(translate_transpose); @@ -92,6 +93,7 @@ const std::map get_supported_ops_jaxpr() { {"rsqrt", op::translate_rsqrt}, {"reshape", op::translate_reshape}, {"slice", op::translate_slice}, + {"square", op::translate_square}, {"sqrt", op::translate_1to1_match_1_input}, {"squeeze", op::translate_squeeze}, {"stop_gradient", op::skip_node}, diff --git a/tests/constraints.txt b/tests/constraints.txt index 004a2c65b5e474..4f46cd0cc8b2e9 100644 --- a/tests/constraints.txt +++ b/tests/constraints.txt @@ -21,11 +21,11 @@ pytest>=5.0,<8.4 pytest-dependency==0.5.1 pytest-html==4.1.1 pytest-timeout==2.3.1 -jax<=0.4.35 -jaxlib<=0.4.35 +jax<=0.4.36 +jaxlib<=0.4.36 kornia==0.7.0 networkx<=3.3 -flax<=0.10.0 +flax<=0.10.2 --extra-index-url https://download.pytorch.org/whl/cpu torch~=2.5.1; platform_system != "Darwin" or platform_machine != "x86_64" diff --git a/tests/layer_tests/jax_tests/test_square.py b/tests/layer_tests/jax_tests/test_square.py new file mode 100644 index 00000000000000..32e842d182e90e --- /dev/null +++ b/tests/layer_tests/jax_tests/test_square.py @@ -0,0 +1,44 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import jax +import numpy as np +import pytest +from jax import numpy as jnp + +from jax_layer_test_class import JaxLayerTest + +rng = np.random.default_rng(34455) + + +class TestSquare(JaxLayerTest): + def _prepare_input(self): + if np.issubdtype(self.input_type, np.floating): + x = rng.uniform(-8.0, 8.0, self.input_shape).astype(self.input_type) + elif np.issubdtype(self.input_type, np.signedinteger): + x = rng.integers(-8, 8, self.input_shape).astype(self.input_type) + else: + x = rng.integers(0, 8, self.input_shape).astype(self.input_type) + x = jnp.array(x) + return [x] + + def create_model(self, input_shape, input_type): + self.input_shape = input_shape + self.input_type = input_type + + def jax_square(x): + return jax.numpy.square(x) + + return jax_square, None, None + + @pytest.mark.parametrize("input_shape", [[2], [3, 4]]) + @pytest.mark.parametrize("input_type", [np.int8, np.uint8, np.int16, np.uint16, + np.int32, np.uint32, np.int64, np.uint64, + np.float16, np.float32, np.float64]) + @pytest.mark.nightly + @pytest.mark.precommit + @pytest.mark.precommit_jax_fe + def test_square(self, ie_device, precision, ir_version, input_shape, input_type): + self._test(*self.create_model(input_shape, input_type), + ie_device, precision, + ir_version)
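
For context on the square converter added in the last patch: translate_square lowers jax.numpy.square to an OpenVINO Power node with a constant exponent of 2, and op_table.cpp registers it under the "square" primitive name. A minimal sketch of why a dedicated converter became necessary, assuming jax/jaxlib >= 0.4.36 are installed (per the commit message, the release where the standalone square primitive starts to appear in traced programs); the expected jaxpr contents are an assumption, not verified output:

    import jax
    import numpy as np
    from jax import numpy as jnp

    x = np.arange(6, dtype=np.float32).reshape(2, 3)
    # On older JAX releases jnp.square is decomposed into other primitives;
    # from 0.4.36 the trace is expected to contain a `square` primitive,
    # which is the entry the new converter registers in get_supported_ops_jaxpr().
    print(jax.make_jaxpr(jnp.square)(x))

The new test_square.py above exercises the same path end to end across the integer and floating-point input types listed in its parametrization.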