From 6198275ac80104adf33abffdf4e3eae313465f7a Mon Sep 17 00:00:00 2001 From: Roman Lyamin Date: Thu, 17 Oct 2024 11:39:08 +0400 Subject: [PATCH 01/32] [GPU] Added tests for LoRA with empty adapters and handling of incorrect fusings (#27093) ### Tickets: - *[152852](https://jira.devtools.intel.com/browse/CVS-152852)* --- .../prepare_primitive_fusing.cpp | 22 +++++++++++++------ .../intel_gpu/src/graph/input_layout.cpp | 6 ++++- .../intel_gpu/src/graph/primitive_inst.cpp | 14 +++++++++++- .../subgraph_tests/lora_pattern.cpp | 21 ++++++++++++++++++ 4 files changed, 54 insertions(+), 9 deletions(-) create mode 100644 src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/lora_pattern.cpp diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 5e8380f35dcb93..c38fa70e86ccef 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -1048,17 +1048,25 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { std::swap(fused_idx, peer_idx); } + auto fused_node = parents[fused_idx].first; + auto peer_node = parents[peer_idx].first; + // Avoid fusing with GEMM from the LoRA pattern, that can be optimized in case of empty adapters - if (parents[fused_idx].first->is_type()) { - if (parents[peer_idx].first->is_type() || - (parents[peer_idx].first->is_type() && - parents[peer_idx].first->get_dependency(0).is_type())) { - std::swap(fused_idx, peer_idx); + if (fused_node->is_type()) { + bool is_fc_lora = peer_node->is_type() || + (peer_node->is_type() && + peer_node->get_dependency(0).is_type()); + + bool is_conv_lora = peer_node->is_type(); + + bool is_gemm_lora = peer_node->is_type() && + fused_node->get_input_pshape().rbegin()->is_dynamic(); + + if (is_fc_lora || is_conv_lora || is_gemm_lora) { + std::swap(peer_node, fused_node); } } - auto fused_node = parents[fused_idx].first; - auto peer_node = parents[peer_idx].first; if (lo.get_optimization_attributes().use_onednn_impls && lo.is_primitive_implemented_for_onednn(*fused_node)) { auto eltw_in_size = peer_node->get_output_layout(); if (eltw_in_size.is_dynamic() diff --git a/src/plugins/intel_gpu/src/graph/input_layout.cpp b/src/plugins/intel_gpu/src/graph/input_layout.cpp index 69cf2e7f834d2d..042744517a7c3e 100644 --- a/src/plugins/intel_gpu/src/graph/input_layout.cpp +++ b/src/plugins/intel_gpu/src/graph/input_layout.cpp @@ -37,7 +37,11 @@ input_layout_inst::typed_primitive_inst(network& network, input_layout_node cons event::ptr input_layout_inst::set_data(memory::ptr mem) { auto ol = get_node_output_layout(); - check_memory_to_set(*mem, ol); + bool empty_mem = mem->size() == 0 && (ol.is_dynamic() || ol.count() == 0); + if (!empty_mem) { + check_memory_to_set(*mem, ol); + } + event::ptr ev = nullptr; auto& engine = get_network().get_engine(); auto& stream = get_network().get_stream(); diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 095dc5fd45fa52..f90d4e34b08cc2 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -1553,8 +1553,13 @@ event::ptr primitive_inst::execute(const std::vector& events) { auto allocated_mem = d.first->output_memory_ptr(); auto actual_input_layout = d.first->get_output_layout(); auto& engine = _network.get_engine(); 
+ cldnn::memory_ptr actual_mem = nullptr; // Need to use actual layout, not the fake aligned memory layout - auto actual_mem = engine.reinterpret_buffer(*allocated_mem, actual_input_layout); + if (actual_input_layout.count() != 0) { + actual_mem = engine.reinterpret_buffer(*allocated_mem, actual_input_layout); + } else { + actual_mem = engine.allocate_memory(actual_input_layout); + } subgraph->set_input_data(d.first->id(), std::move(actual_mem)); } } @@ -2324,6 +2329,13 @@ bool primitive_inst::is_valid_fusion() const { if (fused_eltwise_prims.empty()) return true; + if (_node->is_type() || _node->is_type() || _node->is_type()) { + if (_impl_params->input_layouts[0].count() == 0 || + _impl_params->input_layouts[1].count() == 0) { + return false; + } + } + if (_node->is_type() && _node->get_preferred_impl_type() == impl_types::ocl) { // TODO: Only fc_bf_tiled_kernel & ref kernel are verified for fused eltwise. To support more fc kernels for eltwise fusion if (!_node->get_selected_impl()) diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/lora_pattern.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/lora_pattern.cpp new file mode 100644 index 00000000000000..7bb6fbd610df29 --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/lora_pattern.cpp @@ -0,0 +1,21 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "subgraph_tests/lora_pattern.hpp" + +using namespace ov::test; + +namespace { + +INSTANTIATE_TEST_SUITE_P(smoke, + LoraPatternConvolution, + ::testing::Values(ov::test::utils::DEVICE_GPU), + LoraPatternBase::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke, + LoraPatternMatmul, + ::testing::Values(ov::test::utils::DEVICE_GPU), + LoraPatternBase::getTestCaseName); + +} // namespace From 675dc6e6cb30760c31a162f4f2f2ac5e99afa183 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 17 Oct 2024 15:43:10 +0800 Subject: [PATCH 02/32] [CPU]support glm4 rope (#27094) ### Details: - *Support Rope kernel of GLM4* - *the input data order has changed from (**[seq_length, batch, 4608]**) in **ChatGLM3** to (**[batch, seq_length, 4608]**) in **ChatGLM4**. Within RoPE process, the data order changes from (**[seq_length, batch, head_count, head_size]**) to (**[batch, head_count, seq_length, head_size]**) by permute operation added in **ChatGLM4**.* - *the RoPE cache data order has changed from (**[seq_length, batch, head_count, 2]**) in ChatGLM3 to (**[batch, head_count, seq_length, 2]**) in **ChatGLM4**.* - *Consequently, the output of RoPE has also changed from (**[seq_length, batch, head_count, head_size]**) in **ChatGLM3** to (**[batch, head_count, seq_length, head_size]**) in **ChatGLM4*** - *Due to these changes, the RoPE pattern matching needs to create something new, something different from what already existed ChatGLM pattern matching. 
Additionally, new kernels need to be added to accommodate these changes* ### Tickets: - *ticket-id* --- src/plugins/intel_cpu/src/nodes/rope.cpp | 83 +++++++++++++------ .../transformation_pipeline.cpp | 4 +- .../subgraph_tests/rotary_pos_emb.cpp | 6 ++ 3 files changed, 66 insertions(+), 27 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/rope.cpp b/src/plugins/intel_cpu/src/nodes/rope.cpp index ac95b0f31213de..f089b67a122beb 100644 --- a/src/plugins/intel_cpu/src/nodes/rope.cpp +++ b/src/plugins/intel_cpu/src/nodes/rope.cpp @@ -244,34 +244,67 @@ struct RoPE::RoPEExecutorChatGLM : public RoPE::Executor { if (m_config.slice_stop - m_config.slice_start > 0) { t_src = t_src.slice(2, m_config.slice_start, m_config.slice_stop); } - auto seq_len = t_src.size(0); - auto batch_size = t_src.size(1); - - auto head_cnt = m_config.head_cnt; - auto head_size = m_config.head_size; - - auto rotary_dims = m_config.rotary_ndims; - - parallel_for3d(seq_len, batch_size, head_cnt, [&](size_t p, size_t b, size_t h) { - auto* src = t_src.ptr(p, b, h * head_size); - // [length, batch_size, ndims//2, 2] - auto* cos_sin = &t_cos_sin.at({p, b, 0, 0}, true); - auto* dst = t_dst.ptr(p, b, h, 0); + if (m_config.support_2d_rope) { + // src [batch, length, H x S] + auto seq_len = t_src.size(1); + auto batch_size = t_src.size(0); + + auto head_cnt = m_config.head_cnt; + auto head_size = m_config.head_size; + + auto rotary_dims = m_config.rotary_ndims; + + parallel_for3d(batch_size, head_cnt, seq_len, [&](size_t b, size_t h, size_t p) { + // src [batch, length, H x S] + auto* src = t_src.ptr(b, p, h * head_size); + // [batch_size, length, ndims//2, 2] + auto* cos_sin = &t_cos_sin.at({b, p, 0, 0}, true); + auto* dst = t_dst.ptr(b, h, p, 0); + + if (m_rotaryKernel) { + execJitKernel(m_rotaryKernel, src, dst, cos_sin, nullptr); + } else { + size_t i = 0; + for (; i < rotary_dims; i += 2) { + auto cosv = cos_sin[i]; + auto sinv = cos_sin[i + 1]; + dst[i] = cosv * src[i] - sinv * src[i + 1]; + dst[i + 1] = sinv * src[i] + cosv * src[i + 1]; + } + } - if (m_rotaryKernel) { - execJitKernel(m_rotaryKernel, src, dst, cos_sin, nullptr); - } else { - size_t i = 0; - for (; i < rotary_dims; i += 2) { - auto cosv = cos_sin[i]; - auto sinv = cos_sin[i + 1]; - dst[i] = cosv * src[i] - sinv * src[i + 1]; - dst[i + 1] = sinv * src[i] + cosv * src[i + 1]; + memcpy(dst + rotary_dims, src + rotary_dims, (head_size - rotary_dims) * sizeof(T)); + }); + } else { + auto seq_len = t_src.size(0); + auto batch_size = t_src.size(1); + + auto head_cnt = m_config.head_cnt; + auto head_size = m_config.head_size; + + auto rotary_dims = m_config.rotary_ndims; + + parallel_for3d(seq_len, batch_size, head_cnt, [&](size_t p, size_t b, size_t h) { + auto* src = t_src.ptr(p, b, h * head_size); + // [length, batch_size, ndims//2, 2] + auto* cos_sin = &t_cos_sin.at({p, b, 0, 0}, true); + auto* dst = t_dst.ptr(p, b, h, 0); + + if (m_rotaryKernel) { + execJitKernel(m_rotaryKernel, src, dst, cos_sin, nullptr); + } else { + size_t i = 0; + for (; i < rotary_dims; i += 2) { + auto cosv = cos_sin[i]; + auto sinv = cos_sin[i + 1]; + dst[i] = cosv * src[i] - sinv * src[i + 1]; + dst[i + 1] = sinv * src[i] + cosv * src[i + 1]; + } } - } - memcpy(dst + rotary_dims, src + rotary_dims, (head_size - rotary_dims) * sizeof(T)); - }); + memcpy(dst + rotary_dims, src + rotary_dims, (head_size - rotary_dims) * sizeof(T)); + }); + } } }; diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp 
b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 0e683482a97934..04808baaebec54 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -835,8 +835,8 @@ void Transformations::PostLpt() { // Execute before snippets. Otherwise FQ will be converted to Subgraph CPU_REGISTER_PASS_X64(postLPTPassManager, ConvertFqRnnToQuantizedRnn); - CPU_REGISTER_PASS_X64(postLPTPassManager, ov::pass::RoPEFusion); - CPU_REGISTER_PASS_ARM64(postLPTPassManager, ov::pass::RoPEFusion); + CPU_REGISTER_PASS_X64(postLPTPassManager, ov::pass::RoPEFusion, true); + CPU_REGISTER_PASS_ARM64(postLPTPassManager, ov::pass::RoPEFusion, true); CPU_REGISTER_PASS_X64(postLPTPassManager, CausalMaskPreprocessFusion); // MLP & QKV fusion optimizations is focused on throughput, only enabled on AMX-bf16 & LLM serving use cases. diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp index 7fd916e4300768..8cd8707e047878 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp @@ -50,5 +50,11 @@ INSTANTIATE_TEST_SUITE_P(smoke_RoPETestGPTJSlice, ::testing::Combine(::testing::Values(true, false), ::testing::Values(ov::test::utils::DEVICE_CPU)), RoPETestGPTJSlice::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_RoPETestChatGLM, + RoPETestChatGLM2DRoPEStridedSlice, + ::testing::Values(ov::test::utils::DEVICE_CPU), + RoPETestChatGLM2DRoPEStridedSlice::getTestCaseName); + } // namespace test } // namespace ov From 487b3910f6d611fc84c5cd78a96afb6c0f9bf914 Mon Sep 17 00:00:00 2001 From: Mateusz Mikolajczyk Date: Thu, 17 Oct 2024 12:36:41 +0200 Subject: [PATCH 03/32] [PyOV] Extend Python API with SliceScatter-15 (#27090) ### Details: - *Extend Python API with SliceScatter-15* - *...* ### Tickets: - *CVS-155151* --------- Co-authored-by: Michal Lukaszewski --- .../src/openvino/runtime/opset15/__init__.py | 1 + .../src/openvino/runtime/opset15/ops.py | 29 +++++++++++++++++++ .../python/tests/test_graph/test_create_op.py | 28 ++++++++++++++++++ 3 files changed, 58 insertions(+) diff --git a/src/bindings/python/src/openvino/runtime/opset15/__init__.py b/src/bindings/python/src/openvino/runtime/opset15/__init__.py index 1349508e84b381..96643a7e93d596 100644 --- a/src/bindings/python/src/openvino/runtime/opset15/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset15/__init__.py @@ -15,3 +15,4 @@ from openvino.runtime.opset15.ops import string_tensor_unpack from openvino.runtime.opset15.ops import bitwise_left_shift from openvino.runtime.opset15.ops import bitwise_right_shift +from openvino.runtime.opset15.ops import slice_scatter diff --git a/src/bindings/python/src/openvino/runtime/opset15/ops.py b/src/bindings/python/src/openvino/runtime/opset15/ops.py index 777fc165443f7f..116f63726bfeb6 100644 --- a/src/bindings/python/src/openvino/runtime/opset15/ops.py +++ b/src/bindings/python/src/openvino/runtime/opset15/ops.py @@ -274,3 +274,32 @@ def bitwise_right_shift( "auto_broadcast": auto_broadcast.upper(), }, ) + + +@nameable_op +def slice_scatter( + data: NodeInput, + updates: NodeInput, + start: NodeInput, + stop: NodeInput, + step: NodeInput, + axes: Optional[NodeInput] = None, + name: Optional[str] = 
None, +) -> Node: + """Return a node which generates SliceScatter operation. + + :param data: The node providing input data. + :param updates: The node providing updates data. + :param start: The node providing start indices (inclusively). + :param stop: The node providing stop indices (exclusively). + :param step: The node providing step values. + :param axes: The optional node providing axes to slice, default [0, 1, ..., len(start)-1]. + :param name: The optional name for the created output node. + :return: The new node performing SliceScatter operation. + """ + if axes is None: + inputs = as_nodes(data, updates, start, stop, step, name=name) + else: + inputs = as_nodes(data, updates, start, stop, step, axes, name=name) + + return _get_node_factory_opset15().create("SliceScatter", inputs) diff --git a/src/bindings/python/tests/test_graph/test_create_op.py b/src/bindings/python/tests/test_graph/test_create_op.py index dcdb8592390ad4..c5023588f5d55b 100644 --- a/src/bindings/python/tests/test_graph/test_create_op.py +++ b/src/bindings/python/tests/test_graph/test_create_op.py @@ -2458,6 +2458,34 @@ def test_topk_opset11(op_name): assert list(node.get_output_shape(1)) == [1, 3, 3] +def test_slice_scatter(): + data_shape = [10, 7, 2, 13] + data = ov.parameter(data_shape, name="input", dtype=np.float32) + updates = ov.parameter([4, 7, 2, 13], name="updates", dtype=np.float32) + start = ov.constant(np.array([2, 0, 0], dtype=np.int32)) + stop = ov.constant(np.array([9, 7, 2], dtype=np.int32)) + step = ov.constant(np.array([2, 1, 1], dtype=np.int32)) + + node_default_axes = ov_opset15.slice_scatter(data, updates, start, stop, step) + + assert node_default_axes.get_type_name() == "SliceScatter" + assert node_default_axes.get_output_size() == 1 + assert node_default_axes.get_output_element_type(0) == Type.f32 + assert node_default_axes.get_output_shape(0) == data_shape + + start = ov.constant(np.array([0, 2], dtype=np.int32)) + stop = ov.constant(np.array([2, 9], dtype=np.int32)) + step = ov.constant(np.array([1, 2], dtype=np.int32)) + axes = ov.constant(np.array([-2, 0], dtype=np.int32)) + + node = ov_opset15.slice_scatter(data, updates, start, stop, step, axes) + + assert node.get_type_name() == "SliceScatter" + assert node.get_output_size() == 1 + assert node.get_output_element_type(0) == Type.f32 + assert node_default_axes.get_output_shape(0) == data_shape + + def test_parameter_get_attributes(): parameter = ov.parameter([2, 2], dtype=np.float32, name="InputData") parameter_attributes = parameter.get_attributes() From a356945d15013eba28221d2ad5b1b17d52b570a0 Mon Sep 17 00:00:00 2001 From: pravin25 Date: Thu, 17 Oct 2024 07:02:38 -0400 Subject: [PATCH 04/32] Details: (#27008) ### Details: GPU tensorflow_tests/test_tf_Unique.py test was failing, Added in slice_ref_kernel following output datatype which was missing. k. EnableOutputDataType(Datatype::UINT8); And updated tensorflow_tests/test_tf_Unique.py, removed skip for GPU. 
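For context, a minimal sketch of the mechanism involved (using a simplified, illustrative `ParamsKey`; the real kernel_selector classes are more involved): the GPU kernel selector only considers a kernel whose supported-key lists the requested output datatype, which is why the missing `UINT8` entry in `slice_ref` surfaced as "Could not find a suitable kernel for slice" for u8 outputs. Only the `EnableOutputDataType(Datatype::UINT8)` call corresponds to the actual change in this patch; the `Datatype` enum and `Supports()` helper below are simplified stand-ins.

```cpp
// Illustrative sketch only: a ParamsKey-style bitmask check similar in spirit to the
// clDNN kernel selector's supported-key filtering. Names and structure are simplified
// and do not mirror the real kernel_selector implementation.
#include <cstdint>
#include <iostream>

enum class Datatype : uint32_t { INT8 = 1u << 0, UINT8 = 1u << 1, F16 = 1u << 2, F32 = 1u << 3, INT32 = 1u << 4, INT64 = 1u << 5 };

struct ParamsKey {
    uint32_t output_types = 0;
    void EnableOutputDataType(Datatype dt) { output_types |= static_cast<uint32_t>(dt); }
    bool Supports(Datatype dt) const { return (output_types & static_cast<uint32_t>(dt)) != 0; }
};

int main() {
    ParamsKey k;
    // Datatypes already enabled by slice_ref before this patch:
    k.EnableOutputDataType(Datatype::F16);
    k.EnableOutputDataType(Datatype::F32);
    k.EnableOutputDataType(Datatype::INT32);
    k.EnableOutputDataType(Datatype::INT64);
    std::cout << "u8 supported: " << k.Supports(Datatype::UINT8) << "\n";  // 0 -> kernel is rejected for u8 outputs
    k.EnableOutputDataType(Datatype::UINT8);                               // the one-line fix from this patch
    std::cout << "u8 supported: " << k.Supports(Datatype::UINT8) << "\n";  // 1 -> kernel can now be selected
    return 0;
}
```
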
### Tickets: - https://jira.devtools.intel.com/browse/CVS-105900?focusedId=25293445#comment-25293445 --- .../src/kernel_selector/kernels/slice/slice_kernel_ref.cpp | 1 + tests/layer_tests/tensorflow_tests/test_tf_Unique.py | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/slice/slice_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/slice/slice_kernel_ref.cpp index 34279dd7de148c..4aff7736ff85fe 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/slice/slice_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/slice/slice_kernel_ref.cpp @@ -90,6 +90,7 @@ ParamsKey SliceKernelRef::GetSupportedKey() const { k.EnableOutputDataType(Datatype::F32); k.EnableOutputDataType(Datatype::INT32); k.EnableOutputDataType(Datatype::INT64); + k.EnableOutputDataType(Datatype::UINT8); k.EnableInputLayout(DataLayout::bfyx); k.EnableInputLayout(DataLayout::bfzyx); k.EnableOutputLayout(DataLayout::bfyx); diff --git a/tests/layer_tests/tensorflow_tests/test_tf_Unique.py b/tests/layer_tests/tensorflow_tests/test_tf_Unique.py index 6e18c900328aa3..2a3082abebbc6f 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_Unique.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_Unique.py @@ -42,8 +42,6 @@ def test_unique_basic(self, params, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): if use_legacy_frontend: pytest.skip("Unique operation is not supported via legacy frontend.") - if ie_device == 'GPU': - pytest.skip("GPU error: Could not find a suitable kernel for slice") self._test(*self.create_unique_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) @@ -59,8 +57,6 @@ def test_unique_other_types(self, params, ie_device, precision, ir_version, temp use_legacy_frontend): if use_legacy_frontend: pytest.skip("Unique operation is not supported via legacy frontend.") - if ie_device == 'GPU': - pytest.skip("GPU error: Could not find a suitable kernel for slice") self._test(*self.create_unique_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) From 6644fc81f5942cdab2e8d863cd6b721620980faa Mon Sep 17 00:00:00 2001 From: Piotr Kowalczyk Date: Thu, 17 Oct 2024 13:09:35 +0200 Subject: [PATCH 05/32] [Spec]: Added initial specification of Searchsorted op (#26887) ### Details: - Based on https://pytorch.org/docs/stable/generated/torch.searchsorted.html ### Tickets: - *CVS-154060* --------- Co-authored-by: Roman Kazantsev Co-authored-by: Przemyslaw Wysocki Co-authored-by: Katarzyna Mitrus --- .../operation-sets/operation-specs.rst | 1 + .../operation-specs/sort/search-sorted-15.rst | 73 +++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sort/search-sorted-15.rst diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst index 2d03cf7cdce069..7ac47116595621 100644 --- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst @@ -197,6 +197,7 @@ Operation Specifications ScatterElementsUpdate-12 ScatterNDUpdate-3 ScatterUpdate-3 + SearchSorted-15 Select-1 Selu-1 ShapeOf-1 diff --git 
a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sort/search-sorted-15.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sort/search-sorted-15.rst new file mode 100644 index 00000000000000..81c592d3341a35 --- /dev/null +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sort/search-sorted-15.rst @@ -0,0 +1,73 @@ +SearchSorted +=============== + + +.. meta:: + :description: Learn about SearchSorted - a sorting and maximization + operation, which requires two input tensors. + + +**Versioned name**: *SearchSorted-15* + +**Category**: *Sorting and maximization* + +**Short description**: Determines the indices in the innermost dimension of a sorted sequence where elements should be inserted to maintain order. + +**Detailed description**: *SearchSorted* operation determines the indices in the innermost dimension of a sorted sequence where elements should be inserted to maintain order. The operation is based on the binary search algorithm. The operation is performed on two input tensors: the first tensor contains a monotonically increasing sequence on the innermost dimension, and the second tensor contains the search values. The operation returns a tensor with the same shape as the second input tensor, containing the indices. + +**Attributes** + +* *right* + + * **Description**: If False, set the first suitable index. If True, return the last suitable index for given value. Default is False. + * **Range of values**: true or false + * **Type**: boolean + * **Default value**: false + * **Required**: *no* + +**Inputs**: + +* **1**: ``sorted`` - ND input tensor of type *T* - cannot be a scalar, containing monotonically increasing sequence on the innermost dimension. **Required.** + +* **2**: ``values`` - ND input tensor of type *T*, containing the search values. If sorted sequence is 1D, then the values can have any shape, otherwise the rank should be equal to the rank of sorted input. **Required.** + +**Outputs**: + +* **1**: Tensor of type *TOut*, with the same shape as second input tensor, containing the indices. + +**Types** + +* *T*: any supported floating-point and integer type. + +* *TOut*: int64. + +**Example** + +.. 
code-block:: xml + :force: + + + + + + 7 + 256 + 200 + 200 + + + 7 + 256 + 200 + 10 + + + + + 7 + 256 + 200 + 10 + + + From 6dbca1f07afb290660efcd0fcb4552c4a4fe3eb3 Mon Sep 17 00:00:00 2001 From: Ekaterina Shiryaeva Date: Thu, 17 Oct 2024 13:43:08 +0200 Subject: [PATCH 06/32] NPUW: Disable AVX2 code with ENABLE_AVX2=OFF (#26890) ### Details: - Disable AVX2 code with ENABLE_AVX2=OFF ### Tickets: - E-141645 --- .../intel_npu/src/plugin/CMakeLists.txt | 11 +- .../intel_npu/src/plugin/npuw/util.cpp | 1402 +--------------- .../intel_npu/src/plugin/npuw/util_xarch.cpp | 1429 +++++++++++++++++ .../intel_npu/src/plugin/npuw/util_xarch.hpp | 88 + 4 files changed, 1548 insertions(+), 1382 deletions(-) create mode 100644 src/plugins/intel_npu/src/plugin/npuw/util_xarch.cpp create mode 100644 src/plugins/intel_npu/src/plugin/npuw/util_xarch.hpp diff --git a/src/plugins/intel_npu/src/plugin/CMakeLists.txt b/src/plugins/intel_npu/src/plugin/CMakeLists.txt index 4b91e6d594cc20..749819b457c82c 100644 --- a/src/plugins/intel_npu/src/plugin/CMakeLists.txt +++ b/src/plugins/intel_npu/src/plugin/CMakeLists.txt @@ -66,9 +66,12 @@ target_include_directories(${TARGET_NAME} $ ) -if(ENABLE_AVX2) - ov_avx2_optimization_flags(avx2_flags) - target_compile_options(${TARGET_NAME} PRIVATE "${avx2_flags}") -endif() +cross_compiled_file(${TARGET_NAME} + ARCH AVX2 ANY + npuw/util_xarch.cpp + API npuw/util_xarch.hpp + NAME unpack_i4i8 unpack_u4i8 unpack_i4f16 unpack_i4f16_scale unpack_i4f16_z unpack_u4f16 unpack_u4f16_scale_zp unpack_u4f16_asymm_zp unpack_u4f16_z unpack_u4f32 unpack_i8f16 unpack_i8f16_scale unpack_u8f16 to_f16 + NAMESPACE ov::npuw::util::XARCH +) ov_add_api_validator_post_build_step(TARGET ${NPU_PLUGIN_TARGET}) diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.cpp b/src/plugins/intel_npu/src/plugin/npuw/util.cpp index 59851b00a5407b..1de8f4de4bdb4f 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.cpp @@ -4,8 +4,6 @@ #include "util.hpp" -#include - #include #include #include @@ -17,10 +15,7 @@ #include "openvino/op/transpose.hpp" #include "openvino/op/util/op_types.hpp" #include "openvino/runtime/make_tensor.hpp" // get_tensor_impl - -#ifdef UNPACK_PROFILING -# include "tbb/concurrent_unordered_map.h" -#endif +#include "util_xarch.hpp" bool ov::npuw::util::is_set(const std::size_t sub_idx, const std::string& opt) { if (opt.empty() || opt == "NO") { @@ -39,6 +34,16 @@ bool ov::npuw::util::is_set(const std::size_t sub_idx, const std::string& opt) { return false; } +namespace { +inline uint8_t hi4(uint8_t x) { + return x >> 4; +} + +inline uint8_t lo4(uint8_t x) { + return x & 0xF; +} +} // namespace + ov::Tensor ov::npuw::util::tensor_from_const(const std::shared_ptr& node) { NPUW_ASSERT(ov::op::util::is_constant(node)); NPUW_ASSERT(node->outputs().size() == 1); @@ -61,1346 +66,6 @@ std::string ov::npuw::util::fmt(std::size_t number, std::size_t total) { return ss.str(); } -namespace { - -inline int8_t hi4(int8_t x) { - return ((x & (1 << 7)) >> 4) | ((x & (1 << 6)) >> 4) | ((x & (1 << 5)) >> 4) | ((x & (1 << 4)) >> 4); -} - -inline int8_t lo4(int8_t x) { - return (x & (1 << 3)) | (x & (1 << 2)) | (x & (1 << 1)) | (x & (1 << 0)); -} - -inline uint8_t hi4(uint8_t x) { - return x >> 4; -} - -inline uint8_t lo4(uint8_t x) { - return x & 0xF; -} - -inline int8_t upc(int8_t h) { - return h | (-((h & (1 << 3)) >> 3) & (-8)); -} - -// NOTE: This routine implements the NEW ORDER -#define avx2_i4toi8(vinput, vout0, vout1) \ - { \ - __m256i himask = 
_mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, 0xF0)); \ - __m256i lomask = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, 0x0F)); \ - __m256i vsgmask = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, 1 << 3)); \ - __m256i vzero = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, 0)); \ - __m256i vextend = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, (-8))); \ - \ - __m256i vht = _mm256_and_si256(vinput, himask); \ - __m256i vhi = _mm256_srli_epi16(vht, 4); \ - __m256i vlo = _mm256_and_si256(vinput, lomask); \ - \ - __m256i vsghi = _mm256_srli_epi16(_mm256_and_si256(vhi, vsgmask), 3); \ - __m256i vsglo = _mm256_srli_epi16(_mm256_and_si256(vlo, vsgmask), 3); \ - __m256i vsubhi = _mm256_sub_epi8(vzero, vsghi); \ - __m256i vsublo = _mm256_sub_epi8(vzero, vsglo); \ - __m256i vhires = _mm256_or_si256(vhi, _mm256_and_si256(vsubhi, vextend)); \ - __m256i vlores = _mm256_or_si256(vlo, _mm256_and_si256(vsublo, vextend)); \ - \ - __m256i vunlo = _mm256_unpacklo_epi8(vlores, vhires); \ - __m256i vunhi = _mm256_unpackhi_epi8(vlores, vhires); \ - *vout0 = _mm256_permute2x128_si256(vunlo, vunhi, 0x20); \ - *vout1 = _mm256_permute2x128_si256(vunlo, vunhi, 0x31); \ - } - -inline __m128i avx2_i8tof16(__m128i vi8) { - __m256i i32vec = _mm256_cvtepi8_epi32(vi8); // extend: 8 x i8 -> 8 x i32 [256b of 256b] - __m256 f32vec = _mm256_cvtepi32_ps(i32vec); // convert: 8 x i32 -> 8 x f32 [256b of 256b] - return _mm256_cvtps_ph(f32vec, _MM_FROUND_TO_NEAREST_INT); // convert: 8 x f32 -> 8 x f16 [128b] -} - -inline __m128i avx2_i8tof16(__m128i vi8, __m256 s) { - __m256i i32vec = _mm256_cvtepi8_epi32(vi8); // extend: 8 x i8 -> 8 x i32 [256b of 256b] - __m256 f32vec = _mm256_cvtepi32_ps(i32vec); // convert: 8 x i32 -> 8 x f32 [256b of 256b] - __m256 f32scl = _mm256_mul_ps(f32vec, s); // scale: 8 x f32 -> 8 x f32 [256b of 256b] - return _mm256_cvtps_ph(f32scl, _MM_FROUND_TO_NEAREST_INT); // convert: 8 x f32 -> 8 x f16 [128b] -} - -inline __m128i avx2_u8tof16_hi(__m128i vu8, __m256 z, __m256 s) { - __m256i u32vec = _mm256_cvtepu8_epi32(vu8); // extend: 8 x u8 -> 8 x i32 [256b of 256b] - __m256 f32vec = _mm256_cvtepi32_ps(u32vec); // convert: 8 x i32 -> 8 x f32 [256b of 256b] - __m256 f32sub = _mm256_sub_ps(f32vec, z); // subtract: 8 x f32 -> 8 x f32 [256b of 256b] - __m256 f32scl = _mm256_mul_ps(f32sub, s); // scale: 8 x f32 -> 8 x f32 [256b of 256b] - return _mm256_cvtps_ph(f32scl, _MM_FROUND_TO_NEAREST_INT); // convert: 8 x f32 -> 8 x f16 [128b] -} - -inline __m128i avx2_u8tof16_lo(__m128i vu8, __m256 z, __m256 s) { - __m128i vu8h = _mm_bsrli_si128(vu8, 8); - return avx2_u8tof16_hi(vu8h, z, s); -} - -inline __m128i avx2_u8tof16(__m128i vi8, __m256 z, __m256 s) { - __m256i i32vec = _mm256_cvtepu8_epi32(vi8); // extend: 8 x i8 -> 8 x i32 [256b of 256b] - __m256 f32vec = _mm256_cvtepi32_ps(i32vec); // convert: 8 x i32 -> 8 x f32 [256b of 256b] - __m256 f32sub = _mm256_sub_ps(f32vec, z); // subtract: 8 x f32 -> 8 x f32 [256b of 256b] - __m256 f32scl = _mm256_mul_ps(f32sub, s); // scale: 8 x f32 -> 8 x f32 [256b of 256b] - return _mm256_cvtps_ph(f32scl, _MM_FROUND_TO_NEAREST_INT); // convert: 8 x f32 -> 8 x f16 [128b] -} - -// NOTE: This routine implements the NEW ORDER -inline void avx2_u4tof16(__m256i vinput, __m128i vout[8], __m256 zvalVec, __m256 svalVec[8]) { - // vinput - 64 x u4 elements - 256 bits - // vout[] - 64 (8x8) x f16 elements - - // NOTE: This is largely a copy of unpack_u4f16() {{ - __m256i himask = _mm256_set1_epi8(static_cast(0xF0)); - __m256i lomask = _mm256_set1_epi8(static_cast(0x0F)); - - // unpacking with 
interleaving - __m256i vht = _mm256_and_si256(vinput, himask); - __m256i xmmUnpackedLo = _mm256_srli_epi16(vht, 4); // 32 x i8 - Extracting High Nibbles - __m256i xmmUnpackedHi = _mm256_and_si256(vinput, lomask); // 32 x i8 - Extracting Low Nibbles - - // need 4 portions of 16 x i8 elements - __m128i unpacked32LoHi = _mm256_castsi256_si128(xmmUnpackedLo); // lower 16 x i8 - Lower 16 of High Nibbles - __m128i unpacked32LoLo = _mm256_extractf128_si256(xmmUnpackedLo, 1); // higher 16 x i8 - Higher 16 of High Nibbles - - __m128i unpacked32HiHi = _mm256_castsi256_si128(xmmUnpackedHi); // lower 16 x i8 - Lower 16 of Low Nibbles - __m128i unpacked32HiLo = _mm256_extractf128_si256(xmmUnpackedHi, 1); // higher 16 x i8 - Higher 16 of Low Nibbles - - // Rearranging of scales - __m256i indices = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); - // Extracting all 64 scales as per the indices specified above - __m256 scale_v_rearranged[] = {_mm256_permutevar8x32_ps(svalVec[0], indices), - _mm256_permutevar8x32_ps(svalVec[1], indices), - _mm256_permutevar8x32_ps(svalVec[2], indices), - _mm256_permutevar8x32_ps(svalVec[3], indices), - _mm256_permutevar8x32_ps(svalVec[4], indices), - _mm256_permutevar8x32_ps(svalVec[5], indices), - _mm256_permutevar8x32_ps(svalVec[6], indices), - _mm256_permutevar8x32_ps(svalVec[7], indices)}; - - // Scaling should happen like this: - // low_nibble[0]->scale[0], high_nibble[0]->scale[1]...low_nibble[31]->scale[60],high_nibble[31]->scale[61] - - // Extracting all the even-indexed scales for the low nibbles - __m256 scale_v_even[] = { - _mm256_permute2f128_ps(scale_v_rearranged[0], scale_v_rearranged[1], 0x20), - _mm256_permute2f128_ps(scale_v_rearranged[2], scale_v_rearranged[3], 0x20), - _mm256_permute2f128_ps(scale_v_rearranged[4], scale_v_rearranged[5], 0x20), - _mm256_permute2f128_ps(scale_v_rearranged[6], scale_v_rearranged[7], 0x20), - }; - - // Extracting all the odd-indexed scales for the high nibbles - __m256 scale_v_odd[] = { - _mm256_permute2f128_ps(scale_v_rearranged[0], scale_v_rearranged[1], 0x31), - _mm256_permute2f128_ps(scale_v_rearranged[2], scale_v_rearranged[3], 0x31), - _mm256_permute2f128_ps(scale_v_rearranged[4], scale_v_rearranged[5], 0x31), - _mm256_permute2f128_ps(scale_v_rearranged[6], scale_v_rearranged[7], 0x31), - }; - - // converting to 64 x f16 - // Higher 16 of High Nibbles - __m128i f16LoLo[] = {avx2_u8tof16_hi(unpacked32LoLo, zvalVec, scale_v_odd[2]), - avx2_u8tof16_lo(unpacked32LoLo, zvalVec, scale_v_odd[3])}; - // Lower 16 of High Nibbles - __m128i f16LoHi[] = {avx2_u8tof16_hi(unpacked32LoHi, zvalVec, scale_v_odd[0]), - avx2_u8tof16_lo(unpacked32LoHi, zvalVec, scale_v_odd[1])}; - // Higher 16 of Low Nibbles - __m128i f16HiLo[] = {avx2_u8tof16_hi(unpacked32HiLo, zvalVec, scale_v_even[2]), - avx2_u8tof16_lo(unpacked32HiLo, zvalVec, scale_v_even[3])}; - // Lower 16 of Low Nibbles - __m128i f16HiHi[] = {avx2_u8tof16_hi(unpacked32HiHi, zvalVec, scale_v_even[0]), - avx2_u8tof16_lo(unpacked32HiHi, zvalVec, scale_v_even[1])}; - - // interleaving back: - // Interleaving lower 8 of low nibbles with lower 8 of high nibbles and so on - vout[0] = _mm_unpacklo_epi16(f16HiHi[0], f16LoHi[0]); - vout[1] = _mm_unpackhi_epi16(f16HiHi[0], f16LoHi[0]); - vout[2] = _mm_unpacklo_epi16(f16HiHi[1], f16LoHi[1]); - vout[3] = _mm_unpackhi_epi16(f16HiHi[1], f16LoHi[1]); - vout[4] = _mm_unpacklo_epi16(f16HiLo[0], f16LoLo[0]); - vout[5] = _mm_unpackhi_epi16(f16HiLo[0], f16LoLo[0]); - vout[6] = _mm_unpacklo_epi16(f16HiLo[1], f16LoLo[1]); - vout[7] = 
_mm_unpackhi_epi16(f16HiLo[1], f16LoLo[1]); -} - -inline __m256 avx2_load_scale(const int8_t* data, ov::element::Type type) { - if (type == ov::element::f32) { - return _mm256_set1_ps(*reinterpret_cast(data)); - } else { - NPUW_ASSERT(type == ov::element::f16); - float val{}; - _mm_store_ss(&val, _mm_cvtph_ps(_mm_cvtsi32_si128(*reinterpret_cast(data)))); - return _mm256_set1_ps(val); - } -} - -inline float avx2_load_f32(const int8_t* data, ov::element::Type type) { - if (type == ov::element::f32) { - return *reinterpret_cast(data); - } else { - NPUW_ASSERT(type == ov::element::f16); - float val{}; - _mm_store_ss(&val, _mm_cvtph_ps(_mm_cvtsi32_si128(*reinterpret_cast(data)))); - return val; - } -} - -#ifdef UNPACK_PROFILING -class UnpackStat { - tbb::concurrent_unordered_map> inferenceTimes; - -public: - UnpackStat() {} - void addRecord(size_t sz, size_t time) { - inferenceTimes[sz].first++; - inferenceTimes[sz].second += time; - } - ~UnpackStat() { - for (auto&& r : inferenceTimes) { - std::cout << "work: " << r.first //<< ", stride: " << stride - << " overall_time = " << r.second.second / 1000 << " [ms]" - << " avg_atime = " << r.second.second / r.second.first << " [µs]\n"; - } - } -}; - -static UnpackStat ustat; -# define UNPACK_START_TICK() std::chrono::steady_clock::time_point _begin_tick = std::chrono::steady_clock::now(); -# define UNPACK_SAVE_TICK() \ - std::chrono::steady_clock::time_point _end_tick = std::chrono::steady_clock::now(); \ - ustat.addRecord(total, std::chrono::duration_cast(_end_tick - _begin_tick).count()); -#else -# define UNPACK_START_TICK() -# define UNPACK_SAVE_TICK() -#endif - -void unpack_i4i8(const ov::SoPtr& from, - const ov::SoPtr& to, - const ov::npuw::util::UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - - // with vectorization above, we: - // - read 256 bits (= 32 bytes, = 64 i4 elements) - // - write 512 bits (= 64 bytes, = 64 i8 elements) - // per every iteration, what translates to (from->size() / 64) iterations - - const std::size_t total = from->get_size(); - int8_t const* pSrc = static_cast(from->data()); // 2 x i4 elements - int8_t* pDst = static_cast(to->data()); // 1 x i8 element - size_t stride = 64; - - auto unpack_body = [pSrc, pDst](size_t index, size_t stride) { - size_t halfStride = stride >> 1; - int8_t const* pSrcLocal = pSrc + halfStride * index; - int8_t* pDstLocal = pDst + stride * index; - - for (size_t j = 0; j < stride; j += 64) { - __m256i inv = _mm256_lddqu_si256(reinterpret_cast(pSrcLocal)); - __m256i* outv0 = reinterpret_cast<__m256i*>(pDstLocal); - __m256i* outv1 = reinterpret_cast<__m256i*>(pDstLocal + 32); - - __m256i vout0, vout1; - avx2_i4toi8(inv, &vout0, &vout1); - - _mm256_storeu_si256(outv0, vout0); - _mm256_storeu_si256(outv1, vout1); - - pSrcLocal += 32; - pDstLocal += 64; - } - }; - // ov work index / 64 - if (unpack_options.nPartitions) { - std::size_t minPartitions; - if (!unpack_options.bStrictPartitioning) { - // some heuristics that every tbb thread workload has to have 2048 elements at least, - // so in terms of stride, it should be 64 * 2048 - minPartitions = total / (64 * 2048); - minPartitions = std::max(1u, minPartitions); - minPartitions = std::min(minPartitions, unpack_options.nPartitions); - } else { - minPartitions = unpack_options.nPartitions; - } - - // calculating stride in elements - this stride give us nPartitions + 1 partitions - stride = static_cast(total / minPartitions); - - // stride 
has to be 64 elements aligned to avoid gaps between workloads - stride = (stride >> 6) << 6; - // if number of partitions to large comparing to workload, min supported stride still have to be clamped to 64 - stride = stride < 64 ? 64 : stride; - } - - UNPACK_START_TICK(); - - if (unpack_options.bUseOvParallelFor) { - ov::parallel_for(total / stride, [unpack_body, stride](size_t index) { - unpack_body(index, stride); - }); - } else { - for (std::size_t index = 0; index < total / stride; index++) { - unpack_body(index, stride); - } - } - // handle tail - size_t tailOffset = (static_cast(total / stride) * stride); - pSrc = static_cast(from->data()) + (tailOffset >> 1); - pDst = static_cast(to->data()) + tailOffset; - - for (std::size_t index = 0; index < ((total % 64) >> 1); index++) { - *(pDst++) = upc(lo4(*(pSrc))); - *(pDst++) = upc(hi4(*(pSrc))); - pSrc++; - } - UNPACK_SAVE_TICK(); -} - -void unpack_u4i8(const ov::SoPtr& from, - const ov::SoPtr& to, - const ov::npuw::util::UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - - uint8_t const* pSrc = static_cast(from->data()); // 2 x u4 elements - int8_t* pDst = static_cast(to->data()); // 1 x i8 element - - const std::size_t total = from->get_size(); - for (std::size_t index = 0; index < total; index += 2) { - pDst[0] = static_cast(lo4(*pSrc)); // LSB is [0] -- since OpenVINO 24.0! - pDst[1] = static_cast(hi4(*pSrc)); // MSB is [1] -- since OpenVINO 24.0! - pSrc++; - pDst += 2; - } -} - -void unpack_i4f16(const ov::SoPtr& from, - const ov::SoPtr& to, - const ov::npuw::util::UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - - // This conversion combines i4toi8 (above) and i8tof16 (below). 
Here we - // - read 256 bits (= 32 bytes, = 64 i4 elements) - // - write 1024 bits (= 128 bytes, = 64 f16 elements) - // per every iteration, what translates to (from->size() / 64) iterations - - std::size_t total = to->get_size(); - int8_t const* pSrc = static_cast(from->data()); // 2 x i4 elements - int16_t* pDst = static_cast(to->data()); // 1 x f16 element - // bool tailOnly = total < 64; - - auto unpack_body = [pSrc, pDst](size_t index) { - int8_t const* pSrcLocal = pSrc + 32 * index; - int16_t* pDstLocal = pDst + 64 * index; - - __m256i inv = _mm256_lddqu_si256(reinterpret_cast(pSrcLocal)); - __m128i* outv[8] = { - reinterpret_cast<__m128i*>(pDstLocal), - reinterpret_cast<__m128i*>(pDstLocal + 8), - reinterpret_cast<__m128i*>(pDstLocal + 16), - reinterpret_cast<__m128i*>(pDstLocal + 24), - reinterpret_cast<__m128i*>(pDstLocal + 32), - reinterpret_cast<__m128i*>(pDstLocal + 40), - reinterpret_cast<__m128i*>(pDstLocal + 48), - reinterpret_cast<__m128i*>(pDstLocal + 56), - }; - - __m256i vout0, vout1; - avx2_i4toi8(inv, &vout0, &vout1); - - int8_t tmp[64]; // FIXME: Avoid it - __m256i* tmpv0 = reinterpret_cast<__m256i*>(tmp); - __m256i* tmpv1 = reinterpret_cast<__m256i*>(tmp + 32); - _mm256_storeu_si256(tmpv0, vout0); - _mm256_storeu_si256(tmpv1, vout1); - - __m128i i8vecs[8] = { - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 8)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 16)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 24)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 32)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 40)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 48)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 56)), - }; - - __m128i vresults[8] = {avx2_i8tof16(i8vecs[0]), - avx2_i8tof16(i8vecs[1]), - avx2_i8tof16(i8vecs[2]), - avx2_i8tof16(i8vecs[3]), - avx2_i8tof16(i8vecs[4]), - avx2_i8tof16(i8vecs[5]), - avx2_i8tof16(i8vecs[6]), - avx2_i8tof16(i8vecs[7])}; - - _mm_storeu_si128(outv[0], vresults[0]); - _mm_storeu_si128(outv[1], vresults[1]); - _mm_storeu_si128(outv[2], vresults[2]); - _mm_storeu_si128(outv[3], vresults[3]); - _mm_storeu_si128(outv[4], vresults[4]); - _mm_storeu_si128(outv[5], vresults[5]); - _mm_storeu_si128(outv[6], vresults[6]); - _mm_storeu_si128(outv[7], vresults[7]); - }; - - if (unpack_options.bUseOvParallelFor) { - ov::parallel_for(total / 64, [&unpack_body](size_t index) { - unpack_body(index); - }); - } else { - for (std::size_t index = 0; index < total / 64; index++) { - unpack_body(index); - } - } - - // handle tail that is < 64 elements - size_t tailOffset = ((total >> 6) << 6); - pSrc = static_cast(from->data()) + (tailOffset >> 1); - pDst = static_cast(to->data()) + tailOffset; - - constexpr std::size_t VECSIZE = 8; - - total = ((total % 64) >> 1); - int8_t unpackedToI8[VECSIZE] = {0}; - size_t unpackedIdx = 0; - for (std::size_t index = 0; index < total; index++) { - unpackedToI8[unpackedIdx++] = upc(lo4(*(pSrc))); - unpackedToI8[unpackedIdx++] = upc(hi4(*(pSrc))); - if (unpackedIdx == VECSIZE) { - __m128i i8vec = _mm_loadl_epi64(reinterpret_cast<__m128i*>(unpackedToI8)); - __m128i f16vec = avx2_i8tof16(i8vec); - _mm_storeu_si128(reinterpret_cast<__m128i*>(pDst), f16vec); - pDst += VECSIZE; - unpackedIdx = 0; - } - pSrc += 1; - } - - // handle tail that is < 8 - if (unpackedIdx != 0) { - int16_t tmp[VECSIZE]; - __m128i i8vec = _mm_loadl_epi64(reinterpret_cast<__m128i*>(unpackedToI8)); - __m128i f16vec = avx2_i8tof16(i8vec); - 
_mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), f16vec); - for (size_t i = 0; i != unpackedIdx; i++) { - pDst[i] = tmp[i]; - } - } -} - -void unpack_i4f16(const ov::SoPtr& from, - const ov::SoPtr& scale, - const ov::SoPtr& to, - const ov::npuw::util::UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(scale->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - - const auto& from_shape = from->get_shape(); - NPUW_ASSERT(from_shape.back() % 64 == 0); - - // 2-channel (Symmetric) and 3-channel (group-wise) - // scale factors are supported. The scale/value loop - // iteration is based on stotal, so should work for - // both cases. - const auto& scale_shape = scale->get_shape(); - NPUW_ASSERT(scale_shape.size() == 3 || scale_shape.size() == 2); - if (scale_shape.size() == 3) { - NPUW_ASSERT(scale_shape[0] == from_shape[0]); - NPUW_ASSERT(scale_shape[1] == from_shape[1]); - NPUW_ASSERT(scale_shape[2] == 1); - } else { - NPUW_ASSERT(scale_shape[0] == from_shape[0]); - NPUW_ASSERT(scale_shape[1] == 1); - } - - const auto scale_elem_type = scale->get_element_type(); - NPUW_ASSERT(scale_elem_type == ov::element::f32 || scale_elem_type == ov::element::f16); - - // This conversion combines i4toi8 (above) and i8tof16 (below). Here we - // - read 256 bits (= 32 bytes, = 64 i4 elements) - // - write 1024 bits (= 128 bytes, = 64 f16 elements) - // per every iteration, what translates to (from->size() / 64) iterations - - const std::size_t total = to->get_size(); - const std::size_t stotal = scale->get_size(); - const std::size_t elementsPerScale = total / stotal; - - // TODO: handle tails - NPUW_ASSERT(elementsPerScale % 64 == 0); - - const int8_t* const pSrc = static_cast(from->data()); // 2 x i4 elements - const int8_t* const pScl = static_cast(scale->data()); // either f16 or f32 - const int16_t* pDst = static_cast(to->data()); // 1 x f16 element - - auto unpack_body = [pSrc, pDst, pScl, elementsPerScale, scale_elem_type, stotal](std::size_t sindex, - std::size_t stride) { - // number of vectorized operations per scale - size_t elementsPerScaleVectorized = elementsPerScale / 64; - - int8_t const* pSrcLocal = pSrc + 32 * elementsPerScaleVectorized * sindex * stride; - int8_t const* pSclLocal = pScl + scale_elem_type.size() * sindex * stride; - int16_t* pDstLocal = const_cast(pDst) + 64 * elementsPerScaleVectorized * sindex * stride; - - // if it is last iteration current stride can be smaller - lets check that - sindex *= stride; - const auto jobFinish = std::min(sindex + stride, stotal); - - for (; sindex != jobFinish; sindex++) { - __m256 svec = avx2_load_scale(pSclLocal, scale_elem_type); - for (std::size_t index = 0; index < elementsPerScale; index += 64) { - __m256i inv = _mm256_lddqu_si256(reinterpret_cast(pSrcLocal)); - __m128i* outv[8] = { - reinterpret_cast<__m128i*>(pDstLocal), - reinterpret_cast<__m128i*>(pDstLocal + 8), - reinterpret_cast<__m128i*>(pDstLocal + 16), - reinterpret_cast<__m128i*>(pDstLocal + 24), - reinterpret_cast<__m128i*>(pDstLocal + 32), - reinterpret_cast<__m128i*>(pDstLocal + 40), - reinterpret_cast<__m128i*>(pDstLocal + 48), - reinterpret_cast<__m128i*>(pDstLocal + 56), - }; - - __m256i vout0, vout1; - avx2_i4toi8(inv, &vout0, &vout1); - - int8_t tmp[64]; // FIXME: Avoid it - __m256i* tmpv0 = reinterpret_cast<__m256i*>(tmp); - __m256i* tmpv1 = reinterpret_cast<__m256i*>(tmp + 32); - _mm256_storeu_si256(tmpv0, vout0); - _mm256_storeu_si256(tmpv1, vout1); - - __m128i i8vecs[8] = { - 
_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 8)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 16)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 24)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 32)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 40)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 48)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 56)), - }; - - __m128i vresults[8] = {avx2_i8tof16(i8vecs[0], svec), - avx2_i8tof16(i8vecs[1], svec), - avx2_i8tof16(i8vecs[2], svec), - avx2_i8tof16(i8vecs[3], svec), - avx2_i8tof16(i8vecs[4], svec), - avx2_i8tof16(i8vecs[5], svec), - avx2_i8tof16(i8vecs[6], svec), - avx2_i8tof16(i8vecs[7], svec)}; - - _mm_storeu_si128(outv[0], vresults[0]); - _mm_storeu_si128(outv[1], vresults[1]); - _mm_storeu_si128(outv[2], vresults[2]); - _mm_storeu_si128(outv[3], vresults[3]); - _mm_storeu_si128(outv[4], vresults[4]); - _mm_storeu_si128(outv[5], vresults[5]); - _mm_storeu_si128(outv[6], vresults[6]); - _mm_storeu_si128(outv[7], vresults[7]); - - pSrcLocal += 32; // shift pSrc only by 32 since it is 64 x i4 - pDstLocal += 64; // note pDst is int16_t - } - pSclLocal += scale_elem_type.size(); - } - }; - size_t stride{1}; - - // since scaling is always 64 elements aligned operations, lets partition only in scale shape - if (unpack_options.nPartitions) { - std::size_t minPartitions; - if (!unpack_options.bStrictPartitioning) { - // some heuristics that every tbb thread workload has to have 2048 x intrinsics operations at least, - // so in terms of stride, it should be nElementsPerscale/64 * 2048 - const auto nIntrinsicsPerScale = elementsPerScale / 64u; - auto minScaleStride = 2048u / nIntrinsicsPerScale; - minScaleStride = std::max(1u, minScaleStride); - minPartitions = stotal / minScaleStride; - minPartitions = std::max(1u, minPartitions); - minPartitions = std::min(minPartitions, unpack_options.nPartitions); - } else { - minPartitions = unpack_options.nPartitions; - } - - // calculating stride in scale elements space - stride = static_cast(stotal / minPartitions); - } - - const size_t numWork = (stotal + stride - 1) / stride; - - if (unpack_options.bUseOvParallelFor) { - ov::parallel_for(numWork, [unpack_body, stride](size_t index) { - unpack_body(index, stride); - }); - } else { - for (std::size_t index = 0; index < numWork; index++) { - unpack_body(index, stride); - } - } -} - -void unpack_i4f16_z(const ov::SoPtr& from, - const ov::SoPtr& scale, - const ov::SoPtr& to, - const ov::npuw::util::UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(scale->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - - const auto& from_shape = from->get_shape(); - NPUW_ASSERT(from_shape.back() % 64 == 0); - - const auto& scale_shape = scale->get_shape(); - NPUW_ASSERT(scale_shape.size() == 3); - NPUW_ASSERT(scale_shape[0] == from_shape[0]); - NPUW_ASSERT(scale_shape[2] == from_shape[2]); - NPUW_ASSERT(scale_shape[1] == 1); - - const auto scale_elem_type = scale->get_element_type(); - NPUW_ASSERT(scale_elem_type == ov::element::f32); - - // This conversion combines i4tof32 and f32tof16. 
Here we - // - read 256 bits (= 32 bytes, = 64 u4 elements) - // - write 1024 bits (= 128 bytes, = 64 f16 elements) - // per every iteration, what translates to (from->size() / 64) iterations - - const size_t C = from_shape[from_shape.size() - 3]; - const size_t H = from_shape[from_shape.size() - 2]; - const size_t W = from_shape[from_shape.size() - 1]; - - const int8_t* const pSrc = static_cast(from->data()); // 2 x i4 elements - const float* const pScl = static_cast(scale->data()); // 1 x f32 element - int16_t* pDst = static_cast(to->data()); // 1 x f16 element - - auto unpack_body = [&](size_t job_index, size_t stride) { - size_t start_c = job_index * stride; - size_t end_c = std::min(C, start_c + stride); - - for (size_t c = start_c; c < end_c; ++c) { - for (size_t h = 0; h < H; ++h) { - for (size_t w = 0; w < W; w += 64) { - const int8_t* pSrc_iter = pSrc + (w + W * h + W * H * c) / 2; - __m256i vinput = _mm256_lddqu_si256(reinterpret_cast(pSrc_iter)); - __m256i vout0, vout1; - avx2_i4toi8(vinput, &vout0, &vout1); - int8_t tmp[64]; // FIXME: Avoid it - __m256i* tmpv0 = reinterpret_cast<__m256i*>(tmp); - __m256i* tmpv1 = reinterpret_cast<__m256i*>(tmp + 32); - _mm256_storeu_si256(tmpv0, vout0); - _mm256_storeu_si256(tmpv1, vout1); - __m128i i8vecs[8] = { - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 8)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 16)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 24)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 32)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 40)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 48)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 56)), - }; - - const float* pScl_iter = pScl + w + W * c; - __m256 svalVec[8]; - for (int i = 0; i < 8; ++i) { - svalVec[i] = _mm256_loadu_ps(pScl_iter + i * 8); - } - - __m128i vresults[8] = {avx2_i8tof16(i8vecs[0], svalVec[0]), - avx2_i8tof16(i8vecs[1], svalVec[1]), - avx2_i8tof16(i8vecs[2], svalVec[2]), - avx2_i8tof16(i8vecs[3], svalVec[3]), - avx2_i8tof16(i8vecs[4], svalVec[4]), - avx2_i8tof16(i8vecs[5], svalVec[5]), - avx2_i8tof16(i8vecs[6], svalVec[6]), - avx2_i8tof16(i8vecs[7], svalVec[7])}; - - int16_t* pDst_iter = pDst + w + W * h + W * H * c; - for (int i = 0; i < 8; ++i) { - _mm_storeu_si128(reinterpret_cast<__m128i*>(pDst_iter + i * 8), vresults[i]); - } - } - } - } - }; - - size_t stride = C; - size_t num_jobs = 1; - - if (unpack_options.nPartitions) { - if (unpack_options.bStrictPartitioning) { - stride = (C + unpack_options.nPartitions - 1) / unpack_options.nPartitions; - num_jobs = unpack_options.nPartitions; - } else { - stride = std::max(1, C / unpack_options.nPartitions); - num_jobs = (C + stride - 1) / stride; - } - } - - if (unpack_options.bUseOvParallelFor) { - ov::parallel_for(num_jobs, [&](size_t job_index) { - unpack_body(job_index, stride); - }); - } else { - for (size_t job_index = 0; job_index < num_jobs; ++job_index) { - unpack_body(job_index, stride); - } - } -} - -void unpack_u4f16(const ov::SoPtr& from, - const ov::SoPtr& to, - const ov::npuw::util::UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - NPUW_ASSERT(from->get_size() % 64 == 0); - - // This conversion combines u4i8 and i8tof16 unpacks. 
Here we - // - read 256 bits (= 32 bytes, = 64 i4 elements) - // - write 1024 bits (= 128 bytes, = 64 f16 elements) - // per every iteration, what translates to (from->size() / 64) iterations - - const std::size_t total = to->get_size(); - int8_t const* pSrc = static_cast(from->data()); // 2 x i4 elements - int16_t* pDst = static_cast(to->data()); // 1 x f16 element - - for (std::size_t index = 0; index < total; index += 64) { - __m128i* outv[8] = { - reinterpret_cast<__m128i*>(pDst), - reinterpret_cast<__m128i*>(pDst + 8), - reinterpret_cast<__m128i*>(pDst + 16), - reinterpret_cast<__m128i*>(pDst + 24), - reinterpret_cast<__m128i*>(pDst + 32), - reinterpret_cast<__m128i*>(pDst + 40), - reinterpret_cast<__m128i*>(pDst + 48), - reinterpret_cast<__m128i*>(pDst + 56), - }; - - int8_t tmp[64]; // FIXME: Avoid it - for (std::size_t ii = 0; ii < 32; ii++) { - tmp[ii * 2] = static_cast(lo4(pSrc[ii])); // LSB is [0] -- since OpenVINO 24.0! - tmp[ii * 2 + 1] = static_cast(hi4(pSrc[ii])); // MSB is [1] -- since OpenVINO 24.0! - } - - __m128i vresults[8] = { - avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp))), - avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 8))), - avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 16))), - avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 24))), - avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 32))), - avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 40))), - avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 48))), - avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 56))), - }; - - _mm_storeu_si128(outv[0], vresults[0]); - _mm_storeu_si128(outv[1], vresults[1]); - _mm_storeu_si128(outv[2], vresults[2]); - _mm_storeu_si128(outv[3], vresults[3]); - _mm_storeu_si128(outv[4], vresults[4]); - _mm_storeu_si128(outv[5], vresults[5]); - _mm_storeu_si128(outv[6], vresults[6]); - _mm_storeu_si128(outv[7], vresults[7]); - - pSrc += 32; // shift pSrc only by 32 since it is 64 x i4 - pDst += 64; // note pDst is int16_t - } -} - -void unpack_u4f16(const ov::SoPtr& from, - const ov::SoPtr& zerop, - const ov::SoPtr& scale, - const ov::SoPtr& to, - const ov::npuw::util::UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(zerop->is_continuous()); - NPUW_ASSERT(scale->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - - // Only single-size ZP is supported - NPUW_ASSERT(zerop->get_size() == 1); - - const auto& from_shape = from->get_shape(); - NPUW_ASSERT(from_shape.back() % 64 == 0); - - // 2-channel (Symmetric) and 3-channel (group-wise) - // scale factors are supported. The scale/value loop - // iteration is based on stotal, so should work for - // both cases. - const auto& scale_shape = scale->get_shape(); - NPUW_ASSERT(scale_shape.size() == 3 || scale_shape.size() == 2); - if (scale_shape.size() == 3) { - NPUW_ASSERT(scale_shape[0] == from_shape[0]); - NPUW_ASSERT(scale_shape[1] == from_shape[1]); - NPUW_ASSERT(scale_shape[2] == 1); - } else { - NPUW_ASSERT(scale_shape[0] == from_shape[0]); - NPUW_ASSERT(scale_shape[1] == 1); - } - - const auto zerop_elem_type = zerop->get_element_type(); - const auto scale_elem_type = scale->get_element_type(); - NPUW_ASSERT(zerop_elem_type == ov::element::u4); - NPUW_ASSERT(scale_elem_type == ov::element::f16); - - // This conversion combines u4tof32 and f32tof16. 
Here we - // - read 256 bits (= 32 bytes, = 64 u4 elements) - // - write 1024 bits (= 128 bytes, = 64 f16 elements) - // per every iteration, what translates to (from->size() / 64) iterations - - const std::size_t total = to->get_size(); - const std::size_t stotal = scale->get_size(); - const std::size_t elementsPerScale = total / stotal; - - const uint8_t* const pSrc = static_cast(from->data()); // 2 x u4 elements - const uint8_t* const pZer = static_cast(zerop->data()); // 1 x u4 element - const int8_t* const pScl = static_cast(scale->data()); // 1 x f16 element - const int16_t* pDst = static_cast(to->data()); // 1 x f16 element - - const float zval = static_cast(lo4(*pZer)); // MSB - since OpenVINO 24.0! - - __m256 zvalVec = _mm256_set1_ps(zval); - - auto unpack_body = [pSrc, pDst, pScl, zvalVec, elementsPerScale, scale_elem_type, stotal](std::size_t sindex, - std::size_t stride) { - // number of vectorized operations per scale - size_t elementsPerScaleVectorized = elementsPerScale / 64; - - uint8_t const* pSrcLocal = pSrc + 32 * elementsPerScaleVectorized * sindex * stride; - int8_t const* pSclLocal = pScl + scale_elem_type.size() * sindex * stride; - int16_t* pDstLocal = const_cast(pDst) + 64 * elementsPerScaleVectorized * sindex * stride; - - // if it is last iteration current stride can be smaller - lets check that - sindex *= stride; - const auto jobFinish = std::min(sindex + stride, stotal); - - for (; sindex < jobFinish; sindex++) { - __m256 svalVec = avx2_load_scale(pSclLocal, scale_elem_type); - - for (std::size_t index = 0; index < elementsPerScale; index += 64) { - __m128i* outv[] = { - reinterpret_cast<__m128i*>(pDstLocal), - reinterpret_cast<__m128i*>(pDstLocal + 8), - reinterpret_cast<__m128i*>(pDstLocal + 16), - reinterpret_cast<__m128i*>(pDstLocal + 24), - reinterpret_cast<__m128i*>(pDstLocal + 32), - reinterpret_cast<__m128i*>(pDstLocal + 40), - reinterpret_cast<__m128i*>(pDstLocal + 48), - reinterpret_cast<__m128i*>(pDstLocal + 56), - }; - __m256i himask = _mm256_set1_epi8(static_cast(0xF0)); - __m256i lomask = _mm256_set1_epi8(static_cast(0x0F)); - - // loading 256 bit u4 into unalligned memory , so 64 elements - // cannot use aligned version here like _mm256_load_si256 - segfault even on unit tests - __m256i xmmData = _mm256_lddqu_si256(reinterpret_cast<__m256i const*>(pSrcLocal)); - - // unpacking with interleaving - __m256i vht = _mm256_and_si256(xmmData, himask); - __m256i xmmUnpackedLo = _mm256_srli_epi16(vht, 4); // 32 x i8 - __m256i xmmUnpackedHi = _mm256_and_si256(xmmData, lomask); // 32 x i8 - - // need 4 portions of 8 x i8 elements - __m128i unpacked32LoHi = _mm256_castsi256_si128(xmmUnpackedLo); // lower 16 x i8 - __m128i unpacked32LoLo = _mm256_extractf128_si256(xmmUnpackedLo, 1); // higher 16 x i8 - - __m128i unpacked32HiHi = _mm256_castsi256_si128(xmmUnpackedHi); // lower 16 x i8 - __m128i unpacked32HiLo = _mm256_extractf128_si256(xmmUnpackedHi, 1); // higher 16 x i8 - - // converting to 32 x f16 - __m128i f16LoLo[] = {avx2_u8tof16_hi(unpacked32LoLo, zvalVec, svalVec), - avx2_u8tof16_lo(unpacked32LoLo, zvalVec, svalVec)}; - - __m128i f16LoHi[] = { - avx2_u8tof16_hi(unpacked32LoHi, zvalVec, svalVec), - avx2_u8tof16_lo(unpacked32LoHi, zvalVec, svalVec), - }; - - __m128i f16HiLo[] = {avx2_u8tof16_hi(unpacked32HiLo, zvalVec, svalVec), - avx2_u8tof16_lo(unpacked32HiLo, zvalVec, svalVec)}; - __m128i f16HiHi[] = {avx2_u8tof16_hi(unpacked32HiHi, zvalVec, svalVec), - avx2_u8tof16_lo(unpacked32HiHi, zvalVec, svalVec)}; - - // interleaving back - __m128i 
interleaved[] = {_mm_unpacklo_epi16(f16HiHi[0], f16LoHi[0]), - _mm_unpackhi_epi16(f16HiHi[0], f16LoHi[0]), - _mm_unpacklo_epi16(f16HiHi[1], f16LoHi[1]), - _mm_unpackhi_epi16(f16HiHi[1], f16LoHi[1]), - _mm_unpacklo_epi16(f16HiLo[0], f16LoLo[0]), - _mm_unpackhi_epi16(f16HiLo[0], f16LoLo[0]), - _mm_unpacklo_epi16(f16HiLo[1], f16LoLo[1]), - _mm_unpackhi_epi16(f16HiLo[1], f16LoLo[1])}; - - // store the results - _mm_storeu_si128(outv[0], interleaved[0]); - _mm_storeu_si128(outv[1], interleaved[1]); - _mm_storeu_si128(outv[2], interleaved[2]); - _mm_storeu_si128(outv[3], interleaved[3]); - _mm_storeu_si128(outv[4], interleaved[4]); - _mm_storeu_si128(outv[5], interleaved[5]); - _mm_storeu_si128(outv[6], interleaved[6]); - _mm_storeu_si128(outv[7], interleaved[7]); - - pSrcLocal += 32; // shift pSrc only by 32 since it is 64 x u4 - pDstLocal += 64; // note pDst is int16_t, so 64 x f16 -> 64 elements - } // for(index) - pSclLocal += scale_elem_type.size(); - } // for(sindex) - }; - - size_t stride{1}; - - // since scaling is always 64 elements aligned operations, lets partition only in scale shape - if (unpack_options.nPartitions) { - std::size_t minPartitions; - if (!unpack_options.bStrictPartitioning) { - // some heuristics that every tbb thread workload has to have 2048 x intrinsics operations at least, - // so in terms of stride, it should be nElementsPerscale/64 * 2048 - const auto nIntrinsicsPerScale = elementsPerScale / 64u; - auto minScaleStride = 2048u / nIntrinsicsPerScale; - minScaleStride = std::max(1u, minScaleStride); - minPartitions = stotal / minScaleStride; - minPartitions = std::max(1u, minPartitions); - minPartitions = std::min(minPartitions, unpack_options.nPartitions); - } else { - minPartitions = unpack_options.nPartitions; - } - - // calculating stride in scale elements space - stride = static_cast(stotal / minPartitions); - } - - const size_t numWork = (stotal + stride - 1) / stride; - - if (unpack_options.bUseOvParallelFor) { - ov::parallel_for(numWork, [unpack_body, stride](size_t index) { - unpack_body(index, stride); - }); - } else { - for (std::size_t index = 0; index < numWork; index++) { - unpack_body(index, stride); - } - } -} - -void unpack_u4f16_asymm_zp(const ov::SoPtr& from, - const ov::SoPtr& zerop, - const ov::SoPtr& scale, - const ov::SoPtr& to, - const ov::npuw::util::UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(zerop->is_continuous()); - NPUW_ASSERT(scale->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - - const auto& from_shape = from->get_shape(); - NPUW_ASSERT(from_shape.back() % 64 == 0); - - // 3-channel (group-wise) scale factors are - // supported. - - const auto& scale_shape = scale->get_shape(); - NPUW_ASSERT(scale_shape.size() == 3); - if (scale_shape.size() == 3) { - NPUW_ASSERT(scale_shape[0] == from_shape[0]); - NPUW_ASSERT(scale_shape[1] == from_shape[1]); - NPUW_ASSERT(scale_shape[2] == 1); - } - - const auto& zerop_shape = zerop->get_shape(); - NPUW_ASSERT(zerop_shape.size() == 3); - if (zerop_shape.size() == 3) { - NPUW_ASSERT(zerop_shape[0] == from_shape[0]); - NPUW_ASSERT(zerop_shape[1] == from_shape[1]); - NPUW_ASSERT(zerop_shape[2] == 1); - } - - const auto zerop_elem_type = zerop->get_element_type(); - const auto scale_elem_type = scale->get_element_type(); - NPUW_ASSERT(zerop_elem_type == ov::element::u4); - NPUW_ASSERT(scale_elem_type == ov::element::f16); - - // This conversion combines u4tof32 and f32tof16. 
Here we - // - read 256 bits (= 32 bytes, = 64 u4 elements) - // - write 1024 bits (= 128 bytes, = 64 f16 elements) - // per every iteration, what translates to (from->size() / 64) iterations - - const std::size_t total = to->get_size(); - const std::size_t stotal = scale->get_size(); - const std::size_t elementsPerScale = total / stotal; - - const uint8_t* const pSrc = static_cast(from->data()); // 2 x u4 elements - const uint8_t* const pZer = static_cast(zerop->data()); // 2 x u4 element - const int8_t* const pScl = static_cast(scale->data()); // 1 x f16 element - const int16_t* pDst = static_cast(to->data()); // 1 x f16 element - - auto unpack_body = [pSrc, pDst, pScl, pZer, elementsPerScale, scale_elem_type, zerop_elem_type, stotal]( - std::size_t sindex, - std::size_t stride) { - // number of vectorized operations per scale - size_t elementsPerScaleVectorized = elementsPerScale / 64; - - uint8_t const* pSrcLocal = pSrc + 32 * elementsPerScaleVectorized * sindex * stride; - int8_t const* pSclLocal = pScl + scale_elem_type.size() * sindex * stride; - uint8_t const* pZerLocal = pZer + zerop_elem_type.size() * sindex * stride / 2; - int16_t* pDstLocal = const_cast(pDst) + 64 * elementsPerScaleVectorized * sindex * stride; - - // if it is last iteration current stride can be smaller - lets check that - sindex *= stride; - const auto jobFinish = std::min(sindex + stride, stotal); - - for (; sindex < jobFinish; sindex++) { - __m256 svalVec = avx2_load_scale(pSclLocal, scale_elem_type); - __m256 zvalVec = _mm256_set1_ps(static_cast((sindex % 2 == 0) ? lo4(*pZerLocal) : hi4(*pZerLocal))); - - for (std::size_t index = 0; index < elementsPerScale; index += 64) { - __m128i* outv[] = { - reinterpret_cast<__m128i*>(pDstLocal), - reinterpret_cast<__m128i*>(pDstLocal + 8), - reinterpret_cast<__m128i*>(pDstLocal + 16), - reinterpret_cast<__m128i*>(pDstLocal + 24), - reinterpret_cast<__m128i*>(pDstLocal + 32), - reinterpret_cast<__m128i*>(pDstLocal + 40), - reinterpret_cast<__m128i*>(pDstLocal + 48), - reinterpret_cast<__m128i*>(pDstLocal + 56), - }; - __m256i himask = _mm256_set1_epi8(static_cast(0xF0)); - __m256i lomask = _mm256_set1_epi8(static_cast(0x0F)); - - // loading 256 bit u4 into unalligned memory , so 64 elements - // cannot use aligned version here like _mm256_load_si256 - segfault even on unit tests - __m256i xmmData = _mm256_lddqu_si256(reinterpret_cast<__m256i const*>(pSrcLocal)); - - // unpacking with interleaving - __m256i vht = _mm256_and_si256(xmmData, himask); - __m256i xmmUnpackedLo = _mm256_srli_epi16(vht, 4); // 32 x i8 - __m256i xmmUnpackedHi = _mm256_and_si256(xmmData, lomask); // 32 x i8 - - // need 4 portions of 8 x i8 elements - __m128i unpacked32LoHi = _mm256_castsi256_si128(xmmUnpackedLo); // lower 16 x i8 - __m128i unpacked32LoLo = _mm256_extractf128_si256(xmmUnpackedLo, 1); // higher 16 x i8 - - __m128i unpacked32HiHi = _mm256_castsi256_si128(xmmUnpackedHi); // lower 16 x i8 - __m128i unpacked32HiLo = _mm256_extractf128_si256(xmmUnpackedHi, 1); // higher 16 x i8 - - // converting to 32 x f16 - __m128i f16LoLo[] = {avx2_u8tof16_hi(unpacked32LoLo, zvalVec, svalVec), - avx2_u8tof16_lo(unpacked32LoLo, zvalVec, svalVec)}; - - __m128i f16LoHi[] = { - avx2_u8tof16_hi(unpacked32LoHi, zvalVec, svalVec), - avx2_u8tof16_lo(unpacked32LoHi, zvalVec, svalVec), - }; - - __m128i f16HiLo[] = {avx2_u8tof16_hi(unpacked32HiLo, zvalVec, svalVec), - avx2_u8tof16_lo(unpacked32HiLo, zvalVec, svalVec)}; - __m128i f16HiHi[] = {avx2_u8tof16_hi(unpacked32HiHi, zvalVec, svalVec), - 
avx2_u8tof16_lo(unpacked32HiHi, zvalVec, svalVec)}; - - // interleaving back - __m128i interleaved[] = {_mm_unpacklo_epi16(f16HiHi[0], f16LoHi[0]), - _mm_unpackhi_epi16(f16HiHi[0], f16LoHi[0]), - _mm_unpacklo_epi16(f16HiHi[1], f16LoHi[1]), - _mm_unpackhi_epi16(f16HiHi[1], f16LoHi[1]), - _mm_unpacklo_epi16(f16HiLo[0], f16LoLo[0]), - _mm_unpackhi_epi16(f16HiLo[0], f16LoLo[0]), - _mm_unpacklo_epi16(f16HiLo[1], f16LoLo[1]), - _mm_unpackhi_epi16(f16HiLo[1], f16LoLo[1])}; - - // store the results - _mm_storeu_si128(outv[0], interleaved[0]); - _mm_storeu_si128(outv[1], interleaved[1]); - _mm_storeu_si128(outv[2], interleaved[2]); - _mm_storeu_si128(outv[3], interleaved[3]); - _mm_storeu_si128(outv[4], interleaved[4]); - _mm_storeu_si128(outv[5], interleaved[5]); - _mm_storeu_si128(outv[6], interleaved[6]); - _mm_storeu_si128(outv[7], interleaved[7]); - - pSrcLocal += 32; // shift pSrc only by 32 since it is 64 x u4 - pDstLocal += 64; // note pDst is int16_t, so 64 x f16 -> 64 elements - } // for(index) - pSclLocal += scale_elem_type.size(); - if (sindex % 2 == 1) { - pZerLocal += zerop_elem_type.size(); - } - } // for(sindex) - }; - - size_t stride{1}; - - // since scaling is always 64 elements aligned operations, lets partition only in scale shape - if (unpack_options.nPartitions) { - std::size_t minPartitions; - if (!unpack_options.bStrictPartitioning) { - // some heuristics that every tbb thread workload has to have 2048 x intrinsics operations at least, - // so in terms of stride, it should be nElementsPerscale/64 * 2048 - const auto nIntrinsicsPerScale = elementsPerScale / 64u; - auto minScaleStride = 2048u / nIntrinsicsPerScale; - minScaleStride = std::max(1u, minScaleStride); - minPartitions = stotal / minScaleStride; - minPartitions = std::max(1u, minPartitions); - minPartitions = std::min(minPartitions, unpack_options.nPartitions); - } else { - minPartitions = unpack_options.nPartitions; - } - - // calculating stride in scale elements space - stride = static_cast(stotal / minPartitions); - } - - const size_t numWork = (stotal + stride - 1) / stride; - - if (unpack_options.bUseOvParallelFor) { - ov::parallel_for(numWork, [unpack_body, stride](size_t index) { - unpack_body(index, stride); - }); - } else { - for (std::size_t index = 0; index < numWork; index++) { - unpack_body(index, stride); - } - } -} - -void unpack_u4f16_z(const ov::SoPtr& from, - const ov::SoPtr& zerop, - const ov::SoPtr& scale, - const ov::SoPtr& to, - const ov::npuw::util::UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(zerop->is_continuous()); - NPUW_ASSERT(scale->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - - // Only single-size ZP is supported - NPUW_ASSERT(zerop->get_size() == 1); - - const auto& from_shape = from->get_shape(); - NPUW_ASSERT(from_shape.back() % 64 == 0); - - const auto& scale_shape = scale->get_shape(); - NPUW_ASSERT(scale_shape.size() == 3); - NPUW_ASSERT(scale_shape[0] == from_shape[0]); - NPUW_ASSERT(scale_shape[2] == from_shape[2]); - NPUW_ASSERT(scale_shape[1] == 1); - - const auto zerop_elem_type = zerop->get_element_type(); - const auto scale_elem_type = scale->get_element_type(); - NPUW_ASSERT(zerop_elem_type == ov::element::f32); - NPUW_ASSERT(scale_elem_type == ov::element::f32); - - // This conversion combines u4tof32 and f32tof16. 
Here we - // - read 256 bits (= 32 bytes, = 64 u4 elements) - // - write 1024 bits (= 128 bytes, = 64 f16 elements) - // per every iteration, what translates to (from->size() / 64) iterations - - const size_t C = from_shape[from_shape.size() - 3]; - const size_t H = from_shape[from_shape.size() - 2]; - const size_t W = from_shape[from_shape.size() - 1]; - - const uint8_t* const pSrc = static_cast(from->data()); // 2 x u4 elements - const float* const pScl = static_cast(scale->data()); // 1 x f32 element - int16_t* pDst = static_cast(to->data()); // 1 x f16 element - - const float zval = avx2_load_f32(reinterpret_cast(zerop->data()), zerop_elem_type); - __m256 zvalVec = _mm256_set1_ps(zval); - - auto unpack_body = [&](size_t job_index, size_t stride) { - size_t start_c = job_index * stride; - size_t end_c = std::min(C, start_c + stride); - - for (size_t c = start_c; c < end_c; ++c) { - for (size_t h = 0; h < H; ++h) { - for (size_t w = 0; w < W; w += 64) { - const uint8_t* pSrc_iter = pSrc + (w + W * h + W * H * c) / 2; - __m256i vinput = _mm256_lddqu_si256(reinterpret_cast(pSrc_iter)); - const float* pScl_iter = pScl + w + W * c; - int16_t* pDst_iter = pDst + w + W * h + W * H * c; - - __m256 svalVec[8]; - for (int i = 0; i < 8; ++i) { - svalVec[i] = _mm256_loadu_ps(pScl_iter + i * 8); - } - - // vectorized unpack u4 to f16 - __m128i htmp[8]; // 64 x f16 - avx2_u4tof16(vinput, htmp, zvalVec, svalVec); - - for (int i = 0; i < 8; ++i) { - _mm_storeu_si128(reinterpret_cast<__m128i*>(pDst_iter + i * 8), htmp[i]); - } - } - } - } - }; - - size_t stride = C; - size_t num_jobs = 1; - - if (unpack_options.nPartitions) { - if (unpack_options.bStrictPartitioning) { - stride = (C + unpack_options.nPartitions - 1) / unpack_options.nPartitions; - num_jobs = unpack_options.nPartitions; - } else { - stride = std::max(1, C / unpack_options.nPartitions); - num_jobs = (C + stride - 1) / stride; - } - } - - if (unpack_options.bUseOvParallelFor) { - ov::parallel_for(num_jobs, [&](size_t job_index) { - unpack_body(job_index, stride); - }); - } else { - for (size_t job_index = 0; job_index < num_jobs; ++job_index) { - unpack_body(job_index, stride); - } - } -} - -void unpack_u4f32(const ov::SoPtr& from, - const ov::SoPtr& to, - const ov::npuw::util::UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - - uint8_t const* pSrc = static_cast(from->data()); // 2 x u4 elements - float* pDst = static_cast(to->data()); // 1 x f32 element - - const std::size_t total = from->get_size(); - for (std::size_t index = 0; index < total; index += 2) { - pDst[0] = static_cast(lo4(*pSrc)); // LSB is [0] - since OpenVINO 2024.0! - pDst[1] = static_cast(hi4(*pSrc)); // MSB is [1] - since OpenVINO 2024.0! 
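(For clarity: the "LSB is [0]" ordering in the scalar loop above means the low nibble of every packed byte becomes the even-indexed output element and the high nibble the odd-indexed one. A minimal standalone sketch of that convention, not part of this patch, with hypothetical values:)

    #include <cassert>
    #include <cstdint>

    int main() {
        // illustrative sketch only, not patch code
        const uint8_t packed = 0xB7;       // two u4 values in one byte
        const uint8_t lo = packed & 0x0F;  // element [0] -> 7
        const uint8_t hi = packed >> 4;    // element [1] -> 11 (0xB)
        assert(lo == 7 && hi == 11);
        return 0;
    }
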
- pSrc++; - pDst += 2; - } -} - -void unpack_i8f16(const ov::SoPtr& from, - const ov::SoPtr& to, - const ov::npuw::util::UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - NPUW_ASSERT(from->get_size() % 8 == 0); - - constexpr std::size_t VECSIZE = 8; - - const std::size_t total = from->get_size(); - int8_t const* pSrc = from->data(); - int16_t* pDst = static_cast(to->data()); - - for (std::size_t index = 0; index < total; index += VECSIZE) { - const __m128i* pSrcV = reinterpret_cast(pSrc); - __m128i* pDstV = reinterpret_cast<__m128i*>(pDst); - __m128i i8vec = _mm_loadl_epi64(pSrcV); // load: 8 x i8 [ 64b of 128b] - __m128i f16vec = avx2_i8tof16(i8vec); - _mm_store_si128(pDstV, f16vec); // store: 8 x f16 [128b] - pSrc += 8; - pDst += 8; - } -} - -void unpack_i8f16(const ov::SoPtr& from, - const ov::SoPtr& scale, - const ov::SoPtr& to, - const ov::npuw::util::UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(scale->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - NPUW_ASSERT(from->get_size() % 8 == 0); - NPUW_ASSERT(scale->get_shape()[0] == from->get_shape()[0]); - NPUW_ASSERT(scale->get_shape()[1] == 1); - - const auto scale_elem_type = scale->get_element_type(); - NPUW_ASSERT(scale_elem_type == ov::element::f32 || scale_elem_type == ov::element::f16); - - constexpr std::size_t VECSIZE = 8; - - const std::size_t total = from->get_size(); - const std::size_t stotal = scale->get_size(); - int8_t const* pSrc = from->data(); - int8_t const* pScl = static_cast(scale->data()); - int16_t* pDst = static_cast(to->data()); - - for (std::size_t sindex = 0u; sindex < stotal; sindex++) { - __m256 svec = avx2_load_scale(pScl, scale_elem_type); - for (std::size_t index = 0u; index < (total / stotal); index += VECSIZE) { - __m128i const* pSrcV = reinterpret_cast(pSrc); - __m128i* pDstV = reinterpret_cast<__m128i*>(pDst); - __m128i i8vec = _mm_loadl_epi64(pSrcV); // load: 8 x i8 [ 64b of 128b] - __m128i f16vec = avx2_i8tof16(i8vec, svec); // convert & scale - _mm_store_si128(pDstV, f16vec); // store: 8 x f16 [128b] - pSrc += 8; - pDst += 8; - } // index - pScl += scale_elem_type.size(); - } // sindex -} - -void unpack_u8f16(const ov::SoPtr& from, - const ov::SoPtr& zerop, - const ov::SoPtr& scale, - const ov::SoPtr& to, - const ov::npuw::util::UnpackOptions& _options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(zerop->is_continuous()); - NPUW_ASSERT(scale->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - NPUW_ASSERT(from->get_size() % 8 == 0); - NPUW_ASSERT(scale->get_shape()[0] == from->get_shape()[0]); - NPUW_ASSERT(scale->get_shape()[1] == 1); - NPUW_ASSERT(zerop->get_shape()[0] == from->get_shape()[0]); - NPUW_ASSERT(zerop->get_shape()[1] == 1); - - const auto scale_elem_type = scale->get_element_type(); - NPUW_ASSERT(scale_elem_type == ov::element::f32 || scale_elem_type == ov::element::f16); - - const auto zerop_elem_type = zerop->get_element_type(); - NPUW_ASSERT(zerop_elem_type == ov::element::u8); - - constexpr std::size_t VECSIZE = 8; - - const std::size_t total = from->get_size(); - const std::size_t stotal = scale->get_size(); - uint8_t const* pSrc = from->data(); - uint8_t const* pZrp = zerop->data(); - int8_t const* pScl = static_cast(scale->data()); - int16_t* pDst = static_cast(to->data()); - - for (std::size_t sindex = 0u; 
sindex < stotal; sindex++) { - __m256 svec = avx2_load_scale(pScl, scale_elem_type); - __m128i u8zp = _mm_set1_epi8(*pZrp); // bcast: 8 x u8 - __m256i u32zp = _mm256_cvtepu8_epi32(u8zp); // i32 zero point - __m256 f32zp = _mm256_cvtepi32_ps(u32zp); // f32 zero point - for (std::size_t index = 0u; index < (total / stotal); index += VECSIZE) { - __m128i const* pSrcV = reinterpret_cast(pSrc); - __m128i* pDstV = reinterpret_cast<__m128i*>(pDst); - __m128i u8in = _mm_loadl_epi64(pSrcV); // load: 8 x u8 - __m128i f16vec = avx2_u8tof16(u8in, f32zp, svec); // convert & scale - _mm_store_si128(pDstV, f16vec); // store: 8 x f16 - pSrc += VECSIZE; - pDst += VECSIZE; - } // index - pScl += scale_elem_type.size(); - pZrp++; - } // sindex -} - -} // namespace - void ov::npuw::util::unpack(const ov::SoPtr& from, const ov::SoPtr& to, const UnpackOptions& unpack_options) { @@ -1411,9 +76,9 @@ void ov::npuw::util::unpack(const ov::SoPtr& from, namespace ove = ov::element; #define CAST(x) static_cast((x).operator ove::Type_t()) #define PAIR(f, t) (CAST(f) << 16 | CAST(t)) -#define HNDL(f, t) \ - case PAIR(ove::f, ove::t): \ - unpack_##f##t(from, to, unpack_options); \ +#define HNDL(f, t) \ + case PAIR(ove::f, ove::t): \ + ov::npuw::util::XARCH::unpack_##f##t(from, to, unpack_options); \ break; switch (PAIR(type_from, type_to)) { HNDL(i4, i8); @@ -1445,16 +110,16 @@ void ov::npuw::util::unpack(const ov::SoPtr& from, if (type_from == ov::element::i4) { if (from_shape.size() == 3) { if (scale_shape[2] == from_shape[2]) { - unpack_i4f16_z(from, scale, to, unpack_options); + ov::npuw::util::XARCH::unpack_i4f16_z(from, scale, to, unpack_options); } else { - unpack_i4f16(from, scale, to, unpack_options); + ov::npuw::util::XARCH::unpack_i4f16_scale(from, scale, to, unpack_options); } } else { NPUW_ASSERT(from_shape.size() == 2); - unpack_i4f16(from, scale, to, unpack_options); + ov::npuw::util::XARCH::unpack_i4f16_scale(from, scale, to, unpack_options); } } else if (type_from == ov::element::i8) { - unpack_i8f16(from, scale, to, unpack_options); + ov::npuw::util::XARCH::unpack_i8f16_scale(from, scale, to, unpack_options); } else { NPUW_ASSERT(false && "Unsupported combination"); } @@ -1507,23 +172,23 @@ void ov::npuw::util::unpack(const ov::SoPtr& from, if (type_from == ov::element::u4) { if (scale_shape.size() == 3 && scale_shape[0] == from_shape[0] && scale_shape[1] == 1 && scale_shape[2] == from_shape[2]) { - unpack_u4f16_z(from, zerop, scale, to, unpack_options); + ov::npuw::util::XARCH::unpack_u4f16_z(from, zerop, scale, to, unpack_options); } else if (scale_shape.size() == 3 && scale_shape[0] == from_shape[0] && scale_shape[1] == from_shape[1] && scale_shape[2] == 1) { if (zerop->get_size() == 1) { - unpack_u4f16(from, zerop, scale, to, unpack_options); + ov::npuw::util::XARCH::unpack_u4f16_scale_zp(from, zerop, scale, to, unpack_options); } else { - unpack_u4f16_asymm_zp(from, zerop, scale, to, unpack_options); + ov::npuw::util::XARCH::unpack_u4f16_asymm_zp(from, zerop, scale, to, unpack_options); } } else if (scale_shape.size() == 2 && scale_shape[0] == from_shape[0] && scale_shape[1] == 1) { - unpack_u4f16(from, zerop, scale, to, unpack_options); + ov::npuw::util::XARCH::unpack_u4f16_scale_zp(from, zerop, scale, to, unpack_options); } else { NPUW_ASSERT(false); } } else if (type_from == ov::element::u8) { // Only support CW for now if (scale_shape.size() == 2 && scale_shape[0] == from_shape[0] && scale_shape[1] == 1) { - unpack_u8f16(from, zerop, scale, to, unpack_options); + 
ov::npuw::util::XARCH::unpack_u8f16(from, zerop, scale, to, unpack_options); } else { NPUW_ASSERT(false); } @@ -1667,26 +332,7 @@ void ov::npuw::util::to_f32(const ov::Tensor& in, ov::Tensor& out) { } ov::Tensor ov::npuw::util::to_f16(const ov::Tensor& t) { - ov::Shape shape = t.get_shape(); - NPUW_ASSERT(t.get_element_type() == ov::element::f32); - NPUW_ASSERT(t.get_size() % 8 == 0); - NPUW_ASSERT(t.is_continuous()); - - ov::Tensor tnew(ov::element::f16, shape); - - const float* psrc = t.data(); - uint8_t* pdst = static_cast(tnew.data()); - - for (std::size_t i = 0; i < t.get_size() / 8; i++) { - __m256 vsrc = _mm256_loadu_ps(psrc); - __m128i vout = _mm256_cvtps_ph(vsrc, _MM_FROUND_TO_NEAREST_INT); - __m128i* pout = reinterpret_cast<__m128i*>(pdst); - _mm_storeu_si128(pout, vout); - psrc += 8; // offset in sizeof(float) - pdst += (8 * 2); // offset in bytes - } - - return tnew; + return ov::npuw::util::XARCH::to_f16(t); } inline uint8_t tread_4b(const ov::Tensor& t, std::size_t r, std::size_t c, std::size_t COLS) { diff --git a/src/plugins/intel_npu/src/plugin/npuw/util_xarch.cpp b/src/plugins/intel_npu/src/plugin/npuw/util_xarch.cpp new file mode 100644 index 00000000000000..37c4770b9d9fa3 --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/util_xarch.cpp @@ -0,0 +1,1429 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#if defined(HAVE_AVX2) +# include +#endif + +#include + +#include "util.hpp" +#include "util_xarch.hpp" + +#ifdef UNPACK_PROFILING +# include "tbb/concurrent_unordered_map.h" +#endif + +namespace { +#if defined(HAVE_AVX2) +inline int8_t hi4(int8_t x) { + return ((x & (1 << 7)) >> 4) | ((x & (1 << 6)) >> 4) | ((x & (1 << 5)) >> 4) | ((x & (1 << 4)) >> 4); +} + +inline int8_t lo4(int8_t x) { + return (x & (1 << 3)) | (x & (1 << 2)) | (x & (1 << 1)) | (x & (1 << 0)); +} +#endif + +inline uint8_t hi4(uint8_t x) { + return x >> 4; +} + +inline uint8_t lo4(uint8_t x) { + return x & 0xF; +} + +#if defined(HAVE_AVX2) +inline int8_t upc(int8_t h) { + return h | (-((h & (1 << 3)) >> 3) & (-8)); +} + +// NOTE: This routine implements the NEW ORDER +# define avx2_i4toi8(vinput, vout0, vout1) \ + { \ + __m256i himask = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, 0xF0)); \ + __m256i lomask = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, 0x0F)); \ + __m256i vsgmask = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, 1 << 3)); \ + __m256i vzero = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, 0)); \ + __m256i vextend = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, (-8))); \ + \ + __m256i vht = _mm256_and_si256(vinput, himask); \ + __m256i vhi = _mm256_srli_epi16(vht, 4); \ + __m256i vlo = _mm256_and_si256(vinput, lomask); \ + \ + __m256i vsghi = _mm256_srli_epi16(_mm256_and_si256(vhi, vsgmask), 3); \ + __m256i vsglo = _mm256_srli_epi16(_mm256_and_si256(vlo, vsgmask), 3); \ + __m256i vsubhi = _mm256_sub_epi8(vzero, vsghi); \ + __m256i vsublo = _mm256_sub_epi8(vzero, vsglo); \ + __m256i vhires = _mm256_or_si256(vhi, _mm256_and_si256(vsubhi, vextend)); \ + __m256i vlores = _mm256_or_si256(vlo, _mm256_and_si256(vsublo, vextend)); \ + \ + __m256i vunlo = _mm256_unpacklo_epi8(vlores, vhires); \ + __m256i vunhi = _mm256_unpackhi_epi8(vlores, vhires); \ + *vout0 = _mm256_permute2x128_si256(vunlo, vunhi, 0x20); \ + *vout1 = _mm256_permute2x128_si256(vunlo, vunhi, 0x31); \ + } + +inline __m128i avx2_i8tof16(__m128i vi8) { + __m256i i32vec = _mm256_cvtepi8_epi32(vi8); // extend: 8 x i8 -> 8 x i32 [256b of 256b] + __m256 f32vec = 
_mm256_cvtepi32_ps(i32vec); // convert: 8 x i32 -> 8 x f32 [256b of 256b] + return _mm256_cvtps_ph(f32vec, _MM_FROUND_TO_NEAREST_INT); // convert: 8 x f32 -> 8 x f16 [128b] +} + +inline __m128i avx2_i8tof16(__m128i vi8, __m256 s) { + __m256i i32vec = _mm256_cvtepi8_epi32(vi8); // extend: 8 x i8 -> 8 x i32 [256b of 256b] + __m256 f32vec = _mm256_cvtepi32_ps(i32vec); // convert: 8 x i32 -> 8 x f32 [256b of 256b] + __m256 f32scl = _mm256_mul_ps(f32vec, s); // scale: 8 x f32 -> 8 x f32 [256b of 256b] + return _mm256_cvtps_ph(f32scl, _MM_FROUND_TO_NEAREST_INT); // convert: 8 x f32 -> 8 x f16 [128b] +} + +inline __m128i avx2_u8tof16_hi(__m128i vu8, __m256 z, __m256 s) { + __m256i u32vec = _mm256_cvtepu8_epi32(vu8); // extend: 8 x u8 -> 8 x i32 [256b of 256b] + __m256 f32vec = _mm256_cvtepi32_ps(u32vec); // convert: 8 x i32 -> 8 x f32 [256b of 256b] + __m256 f32sub = _mm256_sub_ps(f32vec, z); // subtract: 8 x f32 -> 8 x f32 [256b of 256b] + __m256 f32scl = _mm256_mul_ps(f32sub, s); // scale: 8 x f32 -> 8 x f32 [256b of 256b] + return _mm256_cvtps_ph(f32scl, _MM_FROUND_TO_NEAREST_INT); // convert: 8 x f32 -> 8 x f16 [128b] +} + +inline __m128i avx2_u8tof16_lo(__m128i vu8, __m256 z, __m256 s) { + __m128i vu8h = _mm_bsrli_si128(vu8, 8); + return avx2_u8tof16_hi(vu8h, z, s); +} + +inline __m128i avx2_u8tof16(__m128i vi8, __m256 z, __m256 s) { + __m256i i32vec = _mm256_cvtepu8_epi32(vi8); // extend: 8 x i8 -> 8 x i32 [256b of 256b] + __m256 f32vec = _mm256_cvtepi32_ps(i32vec); // convert: 8 x i32 -> 8 x f32 [256b of 256b] + __m256 f32sub = _mm256_sub_ps(f32vec, z); // subtract: 8 x f32 -> 8 x f32 [256b of 256b] + __m256 f32scl = _mm256_mul_ps(f32sub, s); // scale: 8 x f32 -> 8 x f32 [256b of 256b] + return _mm256_cvtps_ph(f32scl, _MM_FROUND_TO_NEAREST_INT); // convert: 8 x f32 -> 8 x f16 [128b] +} + +// NOTE: This routine implements the NEW ORDER +inline void avx2_u4tof16(__m256i vinput, __m128i vout[8], __m256 zvalVec, __m256 svalVec[8]) { + // vinput - 64 x u4 elements - 256 bits + // vout[] - 64 (8x8) x f16 elements + + // NOTE: This is largely a copy of unpack_u4f16() {{ + __m256i himask = _mm256_set1_epi8(static_cast(0xF0)); + __m256i lomask = _mm256_set1_epi8(static_cast(0x0F)); + + // unpacking with interleaving + __m256i vht = _mm256_and_si256(vinput, himask); + __m256i xmmUnpackedLo = _mm256_srli_epi16(vht, 4); // 32 x i8 - Extracting High Nibbles + __m256i xmmUnpackedHi = _mm256_and_si256(vinput, lomask); // 32 x i8 - Extracting Low Nibbles + + // need 4 portions of 16 x i8 elements + __m128i unpacked32LoHi = _mm256_castsi256_si128(xmmUnpackedLo); // lower 16 x i8 - Lower 16 of High Nibbles + __m128i unpacked32LoLo = _mm256_extractf128_si256(xmmUnpackedLo, 1); // higher 16 x i8 - Higher 16 of High Nibbles + + __m128i unpacked32HiHi = _mm256_castsi256_si128(xmmUnpackedHi); // lower 16 x i8 - Lower 16 of Low Nibbles + __m128i unpacked32HiLo = _mm256_extractf128_si256(xmmUnpackedHi, 1); // higher 16 x i8 - Higher 16 of Low Nibbles + + // Rearranging of scales + __m256i indices = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); + // Extracting all 64 scales as per the indices specified above + __m256 scale_v_rearranged[] = {_mm256_permutevar8x32_ps(svalVec[0], indices), + _mm256_permutevar8x32_ps(svalVec[1], indices), + _mm256_permutevar8x32_ps(svalVec[2], indices), + _mm256_permutevar8x32_ps(svalVec[3], indices), + _mm256_permutevar8x32_ps(svalVec[4], indices), + _mm256_permutevar8x32_ps(svalVec[5], indices), + _mm256_permutevar8x32_ps(svalVec[6], indices), + _mm256_permutevar8x32_ps(svalVec[7], 
indices)}; + + // Scaling should happen like this: + // low_nibble[0]->scale[0], high_nibble[0]->scale[1]...low_nibble[31]->scale[60],high_nibble[31]->scale[61] + + // Extracting all the even-indexed scales for the low nibbles + __m256 scale_v_even[] = { + _mm256_permute2f128_ps(scale_v_rearranged[0], scale_v_rearranged[1], 0x20), + _mm256_permute2f128_ps(scale_v_rearranged[2], scale_v_rearranged[3], 0x20), + _mm256_permute2f128_ps(scale_v_rearranged[4], scale_v_rearranged[5], 0x20), + _mm256_permute2f128_ps(scale_v_rearranged[6], scale_v_rearranged[7], 0x20), + }; + + // Extracting all the odd-indexed scales for the high nibbles + __m256 scale_v_odd[] = { + _mm256_permute2f128_ps(scale_v_rearranged[0], scale_v_rearranged[1], 0x31), + _mm256_permute2f128_ps(scale_v_rearranged[2], scale_v_rearranged[3], 0x31), + _mm256_permute2f128_ps(scale_v_rearranged[4], scale_v_rearranged[5], 0x31), + _mm256_permute2f128_ps(scale_v_rearranged[6], scale_v_rearranged[7], 0x31), + }; + + // converting to 64 x f16 + // Higher 16 of High Nibbles + __m128i f16LoLo[] = {avx2_u8tof16_hi(unpacked32LoLo, zvalVec, scale_v_odd[2]), + avx2_u8tof16_lo(unpacked32LoLo, zvalVec, scale_v_odd[3])}; + // Lower 16 of High Nibbles + __m128i f16LoHi[] = {avx2_u8tof16_hi(unpacked32LoHi, zvalVec, scale_v_odd[0]), + avx2_u8tof16_lo(unpacked32LoHi, zvalVec, scale_v_odd[1])}; + // Higher 16 of Low Nibbles + __m128i f16HiLo[] = {avx2_u8tof16_hi(unpacked32HiLo, zvalVec, scale_v_even[2]), + avx2_u8tof16_lo(unpacked32HiLo, zvalVec, scale_v_even[3])}; + // Lower 16 of Low Nibbles + __m128i f16HiHi[] = {avx2_u8tof16_hi(unpacked32HiHi, zvalVec, scale_v_even[0]), + avx2_u8tof16_lo(unpacked32HiHi, zvalVec, scale_v_even[1])}; + + // interleaving back: + // Interleaving lower 8 of low nibbles with lower 8 of high nibbles and so on + vout[0] = _mm_unpacklo_epi16(f16HiHi[0], f16LoHi[0]); + vout[1] = _mm_unpackhi_epi16(f16HiHi[0], f16LoHi[0]); + vout[2] = _mm_unpacklo_epi16(f16HiHi[1], f16LoHi[1]); + vout[3] = _mm_unpackhi_epi16(f16HiHi[1], f16LoHi[1]); + vout[4] = _mm_unpacklo_epi16(f16HiLo[0], f16LoLo[0]); + vout[5] = _mm_unpackhi_epi16(f16HiLo[0], f16LoLo[0]); + vout[6] = _mm_unpacklo_epi16(f16HiLo[1], f16LoLo[1]); + vout[7] = _mm_unpackhi_epi16(f16HiLo[1], f16LoLo[1]); +} + +inline __m256 avx2_load_scale(const int8_t* data, ov::element::Type type) { + if (type == ov::element::f32) { + return _mm256_set1_ps(*reinterpret_cast(data)); + } else { + NPUW_ASSERT(type == ov::element::f16); + float val{}; + _mm_store_ss(&val, _mm_cvtph_ps(_mm_cvtsi32_si128(*reinterpret_cast(data)))); + return _mm256_set1_ps(val); + } +} + +inline float avx2_load_f32(const int8_t* data, ov::element::Type type) { + if (type == ov::element::f32) { + return *reinterpret_cast(data); + } else { + NPUW_ASSERT(type == ov::element::f16); + float val{}; + _mm_store_ss(&val, _mm_cvtph_ps(_mm_cvtsi32_si128(*reinterpret_cast(data)))); + return val; + } +} +#endif + +#ifdef UNPACK_PROFILING +class UnpackStat { + tbb::concurrent_unordered_map> inferenceTimes; + +public: + UnpackStat() {} + void addRecord(size_t sz, size_t time) { + inferenceTimes[sz].first++; + inferenceTimes[sz].second += time; + } + ~UnpackStat() { + for (auto&& r : inferenceTimes) { + std::cout << "work: " << r.first //<< ", stride: " << stride + << " overall_time = " << r.second.second / 1000 << " [ms]" + << " avg_atime = " << r.second.second / r.second.first << " [µs]\n"; + } + } +}; + +static UnpackStat ustat; +# define UNPACK_START_TICK() std::chrono::steady_clock::time_point _begin_tick = 
std::chrono::steady_clock::now(); +# define UNPACK_SAVE_TICK() \ + std::chrono::steady_clock::time_point _end_tick = std::chrono::steady_clock::now(); \ + ustat.addRecord(total, std::chrono::duration_cast(_end_tick - _begin_tick).count()); +#else +# define UNPACK_START_TICK() +# define UNPACK_SAVE_TICK() +#endif +} // namespace + +void ov::npuw::util::XARCH::unpack_i4i8(const ov::SoPtr& from, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + +#if defined(HAVE_AVX2) + // with vectorization above, we: + // - read 256 bits (= 32 bytes, = 64 i4 elements) + // - write 512 bits (= 64 bytes, = 64 i8 elements) + // per every iteration, what translates to (from->size() / 64) iterations + + const std::size_t total = from->get_size(); + int8_t const* pSrc = static_cast(from->data()); // 2 x i4 elements + int8_t* pDst = static_cast(to->data()); // 1 x i8 element + size_t stride = 64; + + auto unpack_body = [pSrc, pDst](size_t index, size_t stride) { + size_t halfStride = stride >> 1; + int8_t const* pSrcLocal = pSrc + halfStride * index; + int8_t* pDstLocal = pDst + stride * index; + + for (size_t j = 0; j < stride; j += 64) { + __m256i inv = _mm256_lddqu_si256(reinterpret_cast(pSrcLocal)); + __m256i* outv0 = reinterpret_cast<__m256i*>(pDstLocal); + __m256i* outv1 = reinterpret_cast<__m256i*>(pDstLocal + 32); + + __m256i vout0, vout1; + avx2_i4toi8(inv, &vout0, &vout1); + + _mm256_storeu_si256(outv0, vout0); + _mm256_storeu_si256(outv1, vout1); + + pSrcLocal += 32; + pDstLocal += 64; + } + }; + + // ov work index / 64 + if (unpack_options.nPartitions) { + std::size_t minPartitions; + if (!unpack_options.bStrictPartitioning) { + // some heuristics that every tbb thread workload has to have 2048 elements at least, + // so in terms of stride, it should be 64 * 2048 + minPartitions = total / (64 * 2048); + minPartitions = std::max(1u, minPartitions); + minPartitions = std::min(minPartitions, unpack_options.nPartitions); + } else { + minPartitions = unpack_options.nPartitions; + } + + // calculating stride in elements - this stride give us nPartitions + 1 partitions + stride = static_cast(total / minPartitions); + + // stride has to be 64 elements aligned to avoid gaps between workloads + stride = (stride >> 6) << 6; + // if number of partitions to large comparing to workload, min supported stride still have to be clamped to 64 + stride = stride < 64 ? 
64 : stride; + } + + UNPACK_START_TICK(); + + if (unpack_options.bUseOvParallelFor) { + ov::parallel_for(total / stride, [unpack_body, stride](size_t index) { + unpack_body(index, stride); + }); + } else { + for (std::size_t index = 0; index < total / stride; index++) { + unpack_body(index, stride); + } + } + // handle tail + size_t tailOffset = (static_cast(total / stride) * stride); + pSrc = static_cast(from->data()) + (tailOffset >> 1); + pDst = static_cast(to->data()) + tailOffset; + + for (std::size_t index = 0; index < ((total % 64) >> 1); index++) { + *(pDst++) = upc(lo4(*(pSrc))); + *(pDst++) = upc(hi4(*(pSrc))); + pSrc++; + } + UNPACK_SAVE_TICK(); +#else + OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); +#endif +} + +void ov::npuw::util::XARCH::unpack_u4i8(const ov::SoPtr& from, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + + uint8_t const* pSrc = static_cast(from->data()); // 2 x u4 elements + int8_t* pDst = static_cast(to->data()); // 1 x i8 element + + const std::size_t total = from->get_size(); + for (std::size_t index = 0; index < total; index += 2) { + pDst[0] = static_cast(lo4(*pSrc)); // LSB is [0] -- since OpenVINO 24.0! + pDst[1] = static_cast(hi4(*pSrc)); // MSB is [1] -- since OpenVINO 24.0! + pSrc++; + pDst += 2; + } +} + +void ov::npuw::util::XARCH::unpack_i4f16(const ov::SoPtr& from, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + +#if defined(HAVE_AVX2) + // This conversion combines i4toi8 (above) and i8tof16 (below). 
Here we + // - read 256 bits (= 32 bytes, = 64 i4 elements) + // - write 1024 bits (= 128 bytes, = 64 f16 elements) + // per every iteration, what translates to (from->size() / 64) iterations + + std::size_t total = to->get_size(); + int8_t const* pSrc = static_cast(from->data()); // 2 x i4 elements + int16_t* pDst = static_cast(to->data()); // 1 x f16 element + // bool tailOnly = total < 64; + + auto unpack_body = [pSrc, pDst](size_t index) { + int8_t const* pSrcLocal = pSrc + 32 * index; + int16_t* pDstLocal = pDst + 64 * index; + + __m256i inv = _mm256_lddqu_si256(reinterpret_cast(pSrcLocal)); + __m128i* outv[8] = { + reinterpret_cast<__m128i*>(pDstLocal), + reinterpret_cast<__m128i*>(pDstLocal + 8), + reinterpret_cast<__m128i*>(pDstLocal + 16), + reinterpret_cast<__m128i*>(pDstLocal + 24), + reinterpret_cast<__m128i*>(pDstLocal + 32), + reinterpret_cast<__m128i*>(pDstLocal + 40), + reinterpret_cast<__m128i*>(pDstLocal + 48), + reinterpret_cast<__m128i*>(pDstLocal + 56), + }; + + __m256i vout0, vout1; + avx2_i4toi8(inv, &vout0, &vout1); + + int8_t tmp[64]; // FIXME: Avoid it + __m256i* tmpv0 = reinterpret_cast<__m256i*>(tmp); + __m256i* tmpv1 = reinterpret_cast<__m256i*>(tmp + 32); + _mm256_storeu_si256(tmpv0, vout0); + _mm256_storeu_si256(tmpv1, vout1); + + __m128i i8vecs[8] = { + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 8)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 16)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 24)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 32)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 40)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 48)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 56)), + }; + + __m128i vresults[8] = {avx2_i8tof16(i8vecs[0]), + avx2_i8tof16(i8vecs[1]), + avx2_i8tof16(i8vecs[2]), + avx2_i8tof16(i8vecs[3]), + avx2_i8tof16(i8vecs[4]), + avx2_i8tof16(i8vecs[5]), + avx2_i8tof16(i8vecs[6]), + avx2_i8tof16(i8vecs[7])}; + + _mm_storeu_si128(outv[0], vresults[0]); + _mm_storeu_si128(outv[1], vresults[1]); + _mm_storeu_si128(outv[2], vresults[2]); + _mm_storeu_si128(outv[3], vresults[3]); + _mm_storeu_si128(outv[4], vresults[4]); + _mm_storeu_si128(outv[5], vresults[5]); + _mm_storeu_si128(outv[6], vresults[6]); + _mm_storeu_si128(outv[7], vresults[7]); + }; + + if (unpack_options.bUseOvParallelFor) { + ov::parallel_for(total / 64, [&unpack_body](size_t index) { + unpack_body(index); + }); + } else { + for (std::size_t index = 0; index < total / 64; index++) { + unpack_body(index); + } + } + + // handle tail that is < 64 elements + size_t tailOffset = ((total >> 6) << 6); + pSrc = static_cast(from->data()) + (tailOffset >> 1); + pDst = static_cast(to->data()) + tailOffset; + + constexpr std::size_t VECSIZE = 8; + + total = ((total % 64) >> 1); + int8_t unpackedToI8[VECSIZE] = {0}; + size_t unpackedIdx = 0; + for (std::size_t index = 0; index < total; index++) { + unpackedToI8[unpackedIdx++] = upc(lo4(*(pSrc))); + unpackedToI8[unpackedIdx++] = upc(hi4(*(pSrc))); + if (unpackedIdx == VECSIZE) { + __m128i i8vec = _mm_loadl_epi64(reinterpret_cast<__m128i*>(unpackedToI8)); + __m128i f16vec = avx2_i8tof16(i8vec); + _mm_storeu_si128(reinterpret_cast<__m128i*>(pDst), f16vec); + pDst += VECSIZE; + unpackedIdx = 0; + } + pSrc += 1; + } + + // handle tail that is < 8 + if (unpackedIdx != 0) { + int16_t tmp[VECSIZE]; + __m128i i8vec = _mm_loadl_epi64(reinterpret_cast<__m128i*>(unpackedToI8)); + __m128i f16vec = avx2_i8tof16(i8vec); + 
_mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), f16vec); + for (size_t i = 0; i != unpackedIdx; i++) { + pDst[i] = tmp[i]; + } + } +#else + OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); +#endif +} + +void ov::npuw::util::XARCH::unpack_i4f16_scale(const ov::SoPtr& from, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(scale->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + + const auto& from_shape = from->get_shape(); + NPUW_ASSERT(from_shape.back() % 64 == 0); + + // 2-channel (Symmetric) and 3-channel (group-wise) + // scale factors are supported. The scale/value loop + // iteration is based on stotal, so should work for + // both cases. + const auto& scale_shape = scale->get_shape(); + NPUW_ASSERT(scale_shape.size() == 3 || scale_shape.size() == 2); + if (scale_shape.size() == 3) { + NPUW_ASSERT(scale_shape[0] == from_shape[0]); + NPUW_ASSERT(scale_shape[1] == from_shape[1]); + NPUW_ASSERT(scale_shape[2] == 1); + } else { + NPUW_ASSERT(scale_shape[0] == from_shape[0]); + NPUW_ASSERT(scale_shape[1] == 1); + } + + const auto scale_elem_type = scale->get_element_type(); + NPUW_ASSERT(scale_elem_type == ov::element::f32 || scale_elem_type == ov::element::f16); + +#if defined(HAVE_AVX2) + // This conversion combines i4toi8 (above) and i8tof16 (below). Here we + // - read 256 bits (= 32 bytes, = 64 i4 elements) + // - write 1024 bits (= 128 bytes, = 64 f16 elements) + // per every iteration, what translates to (from->size() / 64) iterations + + const std::size_t total = to->get_size(); + const std::size_t stotal = scale->get_size(); + const std::size_t elementsPerScale = total / stotal; + + // TODO: handle tails + NPUW_ASSERT(elementsPerScale % 64 == 0); + + const int8_t* const pSrc = static_cast(from->data()); // 2 x i4 elements + const int8_t* const pScl = static_cast(scale->data()); // either f16 or f32 + const int16_t* pDst = static_cast(to->data()); // 1 x f16 element + + auto unpack_body = [pSrc, pDst, pScl, elementsPerScale, scale_elem_type, stotal](std::size_t sindex, + std::size_t stride) { + // number of vectorized operations per scale + size_t elementsPerScaleVectorized = elementsPerScale / 64; + + int8_t const* pSrcLocal = pSrc + 32 * elementsPerScaleVectorized * sindex * stride; + int8_t const* pSclLocal = pScl + scale_elem_type.size() * sindex * stride; + int16_t* pDstLocal = const_cast(pDst) + 64 * elementsPerScaleVectorized * sindex * stride; + + // if it is last iteration current stride can be smaller - lets check that + sindex *= stride; + const auto jobFinish = std::min(sindex + stride, stotal); + + for (; sindex != jobFinish; sindex++) { + __m256 svec = avx2_load_scale(pSclLocal, scale_elem_type); + for (std::size_t index = 0; index < elementsPerScale; index += 64) { + __m256i inv = _mm256_lddqu_si256(reinterpret_cast(pSrcLocal)); + __m128i* outv[8] = { + reinterpret_cast<__m128i*>(pDstLocal), + reinterpret_cast<__m128i*>(pDstLocal + 8), + reinterpret_cast<__m128i*>(pDstLocal + 16), + reinterpret_cast<__m128i*>(pDstLocal + 24), + reinterpret_cast<__m128i*>(pDstLocal + 32), + reinterpret_cast<__m128i*>(pDstLocal + 40), + reinterpret_cast<__m128i*>(pDstLocal + 48), + reinterpret_cast<__m128i*>(pDstLocal + 56), + }; + + __m256i vout0, vout1; + avx2_i4toi8(inv, &vout0, &vout1); + + int8_t tmp[64]; // FIXME: Avoid it + __m256i* tmpv0 = reinterpret_cast<__m256i*>(tmp); + __m256i* tmpv1 = 
reinterpret_cast<__m256i*>(tmp + 32); + _mm256_storeu_si256(tmpv0, vout0); + _mm256_storeu_si256(tmpv1, vout1); + + __m128i i8vecs[8] = { + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 8)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 16)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 24)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 32)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 40)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 48)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 56)), + }; + + __m128i vresults[8] = {avx2_i8tof16(i8vecs[0], svec), + avx2_i8tof16(i8vecs[1], svec), + avx2_i8tof16(i8vecs[2], svec), + avx2_i8tof16(i8vecs[3], svec), + avx2_i8tof16(i8vecs[4], svec), + avx2_i8tof16(i8vecs[5], svec), + avx2_i8tof16(i8vecs[6], svec), + avx2_i8tof16(i8vecs[7], svec)}; + + _mm_storeu_si128(outv[0], vresults[0]); + _mm_storeu_si128(outv[1], vresults[1]); + _mm_storeu_si128(outv[2], vresults[2]); + _mm_storeu_si128(outv[3], vresults[3]); + _mm_storeu_si128(outv[4], vresults[4]); + _mm_storeu_si128(outv[5], vresults[5]); + _mm_storeu_si128(outv[6], vresults[6]); + _mm_storeu_si128(outv[7], vresults[7]); + + pSrcLocal += 32; // shift pSrc only by 32 since it is 64 x i4 + pDstLocal += 64; // note pDst is int16_t + } + pSclLocal += scale_elem_type.size(); + } + }; + size_t stride{1}; + + // since scaling is always 64 elements aligned operations, lets partition only in scale shape + if (unpack_options.nPartitions) { + std::size_t minPartitions; + if (!unpack_options.bStrictPartitioning) { + // some heuristics that every tbb thread workload has to have 2048 x intrinsics operations at least, + // so in terms of stride, it should be nElementsPerscale/64 * 2048 + const auto nIntrinsicsPerScale = elementsPerScale / 64u; + auto minScaleStride = 2048u / nIntrinsicsPerScale; + minScaleStride = std::max(1u, minScaleStride); + minPartitions = stotal / minScaleStride; + minPartitions = std::max(1u, minPartitions); + minPartitions = std::min(minPartitions, unpack_options.nPartitions); + } else { + minPartitions = unpack_options.nPartitions; + } + + // calculating stride in scale elements space + stride = static_cast(stotal / minPartitions); + } + + const size_t numWork = (stotal + stride - 1) / stride; + + if (unpack_options.bUseOvParallelFor) { + ov::parallel_for(numWork, [unpack_body, stride](size_t index) { + unpack_body(index, stride); + }); + } else { + for (std::size_t index = 0; index < numWork; index++) { + unpack_body(index, stride); + } + } +#else + OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); +#endif +} + +void ov::npuw::util::XARCH::unpack_i4f16_z(const ov::SoPtr& from, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(scale->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + + const auto& from_shape = from->get_shape(); + NPUW_ASSERT(from_shape.back() % 64 == 0); + + const auto& scale_shape = scale->get_shape(); + NPUW_ASSERT(scale_shape.size() == 3); + NPUW_ASSERT(scale_shape[0] == from_shape[0]); + NPUW_ASSERT(scale_shape[2] == from_shape[2]); + NPUW_ASSERT(scale_shape[1] == 1); + + const auto scale_elem_type = scale->get_element_type(); + NPUW_ASSERT(scale_elem_type == ov::element::f32); + +#if defined(HAVE_AVX2) + // This conversion combines i4tof32 and f32tof16. 
Here we + // - read 256 bits (= 32 bytes, = 64 u4 elements) + // - write 1024 bits (= 128 bytes, = 64 f16 elements) + // per every iteration, what translates to (from->size() / 64) iterations + + const size_t C = from_shape[from_shape.size() - 3]; + const size_t H = from_shape[from_shape.size() - 2]; + const size_t W = from_shape[from_shape.size() - 1]; + + const int8_t* const pSrc = static_cast(from->data()); // 2 x i4 elements + const float* const pScl = static_cast(scale->data()); // 1 x f32 element + int16_t* pDst = static_cast(to->data()); // 1 x f16 element + + auto unpack_body = [&](size_t job_index, size_t stride) { + size_t start_c = job_index * stride; + size_t end_c = std::min(C, start_c + stride); + + for (size_t c = start_c; c < end_c; ++c) { + for (size_t h = 0; h < H; ++h) { + for (size_t w = 0; w < W; w += 64) { + const int8_t* pSrc_iter = pSrc + (w + W * h + W * H * c) / 2; + __m256i vinput = _mm256_lddqu_si256(reinterpret_cast(pSrc_iter)); + __m256i vout0, vout1; + avx2_i4toi8(vinput, &vout0, &vout1); + int8_t tmp[64]; // FIXME: Avoid it + __m256i* tmpv0 = reinterpret_cast<__m256i*>(tmp); + __m256i* tmpv1 = reinterpret_cast<__m256i*>(tmp + 32); + _mm256_storeu_si256(tmpv0, vout0); + _mm256_storeu_si256(tmpv1, vout1); + __m128i i8vecs[8] = { + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 8)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 16)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 24)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 32)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 40)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 48)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 56)), + }; + + const float* pScl_iter = pScl + w + W * c; + __m256 svalVec[8]; + for (int i = 0; i < 8; ++i) { + svalVec[i] = _mm256_loadu_ps(pScl_iter + i * 8); + } + + __m128i vresults[8] = {avx2_i8tof16(i8vecs[0], svalVec[0]), + avx2_i8tof16(i8vecs[1], svalVec[1]), + avx2_i8tof16(i8vecs[2], svalVec[2]), + avx2_i8tof16(i8vecs[3], svalVec[3]), + avx2_i8tof16(i8vecs[4], svalVec[4]), + avx2_i8tof16(i8vecs[5], svalVec[5]), + avx2_i8tof16(i8vecs[6], svalVec[6]), + avx2_i8tof16(i8vecs[7], svalVec[7])}; + + int16_t* pDst_iter = pDst + w + W * h + W * H * c; + for (int i = 0; i < 8; ++i) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(pDst_iter + i * 8), vresults[i]); + } + } + } + } + }; + + size_t stride = C; + size_t num_jobs = 1; + + if (unpack_options.nPartitions) { + if (unpack_options.bStrictPartitioning) { + stride = (C + unpack_options.nPartitions - 1) / unpack_options.nPartitions; + num_jobs = unpack_options.nPartitions; + } else { + stride = std::max(1, C / unpack_options.nPartitions); + num_jobs = (C + stride - 1) / stride; + } + } + + if (unpack_options.bUseOvParallelFor) { + ov::parallel_for(num_jobs, [&](size_t job_index) { + unpack_body(job_index, stride); + }); + } else { + for (size_t job_index = 0; job_index < num_jobs; ++job_index) { + unpack_body(job_index, stride); + } + } +#else + OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); +#endif +} + +void ov::npuw::util::XARCH::unpack_u4f16(const ov::SoPtr& from, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + NPUW_ASSERT(from->get_size() % 64 == 0); + +#if defined(HAVE_AVX2) + // This conversion combines u4i8 and i8tof16 unpacks. 
Here we + // - read 256 bits (= 32 bytes, = 64 i4 elements) + // - write 1024 bits (= 128 bytes, = 64 f16 elements) + // per every iteration, what translates to (from->size() / 64) iterations + + const std::size_t total = to->get_size(); + int8_t const* pSrc = static_cast(from->data()); // 2 x i4 elements + int16_t* pDst = static_cast(to->data()); // 1 x f16 element + + for (std::size_t index = 0; index < total; index += 64) { + __m128i* outv[8] = { + reinterpret_cast<__m128i*>(pDst), + reinterpret_cast<__m128i*>(pDst + 8), + reinterpret_cast<__m128i*>(pDst + 16), + reinterpret_cast<__m128i*>(pDst + 24), + reinterpret_cast<__m128i*>(pDst + 32), + reinterpret_cast<__m128i*>(pDst + 40), + reinterpret_cast<__m128i*>(pDst + 48), + reinterpret_cast<__m128i*>(pDst + 56), + }; + + int8_t tmp[64]; // FIXME: Avoid it + for (std::size_t ii = 0; ii < 32; ii++) { + tmp[ii * 2] = static_cast(lo4(pSrc[ii])); // LSB is [0] -- since OpenVINO 24.0! + tmp[ii * 2 + 1] = static_cast(hi4(pSrc[ii])); // MSB is [1] -- since OpenVINO 24.0! + } + + __m128i vresults[8] = { + avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp))), + avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 8))), + avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 16))), + avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 24))), + avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 32))), + avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 40))), + avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 48))), + avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 56))), + }; + + _mm_storeu_si128(outv[0], vresults[0]); + _mm_storeu_si128(outv[1], vresults[1]); + _mm_storeu_si128(outv[2], vresults[2]); + _mm_storeu_si128(outv[3], vresults[3]); + _mm_storeu_si128(outv[4], vresults[4]); + _mm_storeu_si128(outv[5], vresults[5]); + _mm_storeu_si128(outv[6], vresults[6]); + _mm_storeu_si128(outv[7], vresults[7]); + + pSrc += 32; // shift pSrc only by 32 since it is 64 x i4 + pDst += 64; // note pDst is int16_t + } +#else + OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); +#endif +} + +void ov::npuw::util::XARCH::unpack_u4f16_scale_zp(const ov::SoPtr& from, + const ov::SoPtr& zerop, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(zerop->is_continuous()); + NPUW_ASSERT(scale->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + + // Only single-size ZP is supported + NPUW_ASSERT(zerop->get_size() == 1); + + const auto& from_shape = from->get_shape(); + NPUW_ASSERT(from_shape.back() % 64 == 0); + + // 2-channel (Symmetric) and 3-channel (group-wise) + // scale factors are supported. The scale/value loop + // iteration is based on stotal, so should work for + // both cases. 
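(A minimal scalar sketch of the math the AVX2 body below vectorizes; dequant_u4_ref and its parameters are hypothetical and for illustration only, not part of this patch. Each u4 element is shifted by the single zero point and multiplied by the scale of its row or group; the real kernel additionally rounds the result to f16.)

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Reference dequantization (illustrative sketch, not patch code):
    // out[i] = (u4[i] - zero_point) * scales[i / elems_per_scale]
    std::vector<float> dequant_u4_ref(const uint8_t* packed, std::size_t n,
                                      float zero_point, const float* scales,
                                      std::size_t elems_per_scale) {
        std::vector<float> out(n);
        for (std::size_t i = 0; i < n; ++i) {
            const uint8_t byte = packed[i / 2];
            const uint8_t u4 = (i % 2 == 0) ? (byte & 0x0F) : (byte >> 4);  // low nibble first
            out[i] = (static_cast<float>(u4) - zero_point) * scales[i / elems_per_scale];
        }
        return out;
    }
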
+ const auto& scale_shape = scale->get_shape(); + NPUW_ASSERT(scale_shape.size() == 3 || scale_shape.size() == 2); + if (scale_shape.size() == 3) { + NPUW_ASSERT(scale_shape[0] == from_shape[0]); + NPUW_ASSERT(scale_shape[1] == from_shape[1]); + NPUW_ASSERT(scale_shape[2] == 1); + } else { + NPUW_ASSERT(scale_shape[0] == from_shape[0]); + NPUW_ASSERT(scale_shape[1] == 1); + } + + const auto zerop_elem_type = zerop->get_element_type(); + const auto scale_elem_type = scale->get_element_type(); + NPUW_ASSERT(zerop_elem_type == ov::element::u4); + NPUW_ASSERT(scale_elem_type == ov::element::f16); + +#if defined(HAVE_AVX2) + // This conversion combines u4tof32 and f32tof16. Here we + // - read 256 bits (= 32 bytes, = 64 u4 elements) + // - write 1024 bits (= 128 bytes, = 64 f16 elements) + // per every iteration, what translates to (from->size() / 64) iterations + + const std::size_t total = to->get_size(); + const std::size_t stotal = scale->get_size(); + const std::size_t elementsPerScale = total / stotal; + + const uint8_t* const pSrc = static_cast(from->data()); // 2 x u4 elements + const uint8_t* const pZer = static_cast(zerop->data()); // 1 x u4 element + const int8_t* const pScl = static_cast(scale->data()); // 1 x f16 element + const int16_t* pDst = static_cast(to->data()); // 1 x f16 element + + const float zval = static_cast(lo4(*pZer)); // MSB - since OpenVINO 24.0! + + __m256 zvalVec = _mm256_set1_ps(zval); + + auto unpack_body = [pSrc, pDst, pScl, zvalVec, elementsPerScale, scale_elem_type, stotal](std::size_t sindex, + std::size_t stride) { + // number of vectorized operations per scale + size_t elementsPerScaleVectorized = elementsPerScale / 64; + + uint8_t const* pSrcLocal = pSrc + 32 * elementsPerScaleVectorized * sindex * stride; + int8_t const* pSclLocal = pScl + scale_elem_type.size() * sindex * stride; + int16_t* pDstLocal = const_cast(pDst) + 64 * elementsPerScaleVectorized * sindex * stride; + + // if it is last iteration current stride can be smaller - lets check that + sindex *= stride; + const auto jobFinish = std::min(sindex + stride, stotal); + + for (; sindex < jobFinish; sindex++) { + __m256 svalVec = avx2_load_scale(pSclLocal, scale_elem_type); + + for (std::size_t index = 0; index < elementsPerScale; index += 64) { + __m128i* outv[] = { + reinterpret_cast<__m128i*>(pDstLocal), + reinterpret_cast<__m128i*>(pDstLocal + 8), + reinterpret_cast<__m128i*>(pDstLocal + 16), + reinterpret_cast<__m128i*>(pDstLocal + 24), + reinterpret_cast<__m128i*>(pDstLocal + 32), + reinterpret_cast<__m128i*>(pDstLocal + 40), + reinterpret_cast<__m128i*>(pDstLocal + 48), + reinterpret_cast<__m128i*>(pDstLocal + 56), + }; + __m256i himask = _mm256_set1_epi8(static_cast(0xF0)); + __m256i lomask = _mm256_set1_epi8(static_cast(0x0F)); + + // loading 256 bit u4 into unalligned memory , so 64 elements + // cannot use aligned version here like _mm256_load_si256 - segfault even on unit tests + __m256i xmmData = _mm256_lddqu_si256(reinterpret_cast<__m256i const*>(pSrcLocal)); + + // unpacking with interleaving + __m256i vht = _mm256_and_si256(xmmData, himask); + __m256i xmmUnpackedLo = _mm256_srli_epi16(vht, 4); // 32 x i8 + __m256i xmmUnpackedHi = _mm256_and_si256(xmmData, lomask); // 32 x i8 + + // need 4 portions of 8 x i8 elements + __m128i unpacked32LoHi = _mm256_castsi256_si128(xmmUnpackedLo); // lower 16 x i8 + __m128i unpacked32LoLo = _mm256_extractf128_si256(xmmUnpackedLo, 1); // higher 16 x i8 + + __m128i unpacked32HiHi = _mm256_castsi256_si128(xmmUnpackedHi); // lower 16 x i8 + __m128i 
unpacked32HiLo = _mm256_extractf128_si256(xmmUnpackedHi, 1); // higher 16 x i8 + + // converting to 32 x f16 + __m128i f16LoLo[] = {avx2_u8tof16_hi(unpacked32LoLo, zvalVec, svalVec), + avx2_u8tof16_lo(unpacked32LoLo, zvalVec, svalVec)}; + + __m128i f16LoHi[] = { + avx2_u8tof16_hi(unpacked32LoHi, zvalVec, svalVec), + avx2_u8tof16_lo(unpacked32LoHi, zvalVec, svalVec), + }; + + __m128i f16HiLo[] = {avx2_u8tof16_hi(unpacked32HiLo, zvalVec, svalVec), + avx2_u8tof16_lo(unpacked32HiLo, zvalVec, svalVec)}; + __m128i f16HiHi[] = {avx2_u8tof16_hi(unpacked32HiHi, zvalVec, svalVec), + avx2_u8tof16_lo(unpacked32HiHi, zvalVec, svalVec)}; + + // interleaving back + __m128i interleaved[] = {_mm_unpacklo_epi16(f16HiHi[0], f16LoHi[0]), + _mm_unpackhi_epi16(f16HiHi[0], f16LoHi[0]), + _mm_unpacklo_epi16(f16HiHi[1], f16LoHi[1]), + _mm_unpackhi_epi16(f16HiHi[1], f16LoHi[1]), + _mm_unpacklo_epi16(f16HiLo[0], f16LoLo[0]), + _mm_unpackhi_epi16(f16HiLo[0], f16LoLo[0]), + _mm_unpacklo_epi16(f16HiLo[1], f16LoLo[1]), + _mm_unpackhi_epi16(f16HiLo[1], f16LoLo[1])}; + + // store the results + _mm_storeu_si128(outv[0], interleaved[0]); + _mm_storeu_si128(outv[1], interleaved[1]); + _mm_storeu_si128(outv[2], interleaved[2]); + _mm_storeu_si128(outv[3], interleaved[3]); + _mm_storeu_si128(outv[4], interleaved[4]); + _mm_storeu_si128(outv[5], interleaved[5]); + _mm_storeu_si128(outv[6], interleaved[6]); + _mm_storeu_si128(outv[7], interleaved[7]); + + pSrcLocal += 32; // shift pSrc only by 32 since it is 64 x u4 + pDstLocal += 64; // note pDst is int16_t, so 64 x f16 -> 64 elements + } // for(index) + pSclLocal += scale_elem_type.size(); + } // for(sindex) + }; + + size_t stride{1}; + + // since scaling is always 64 elements aligned operations, lets partition only in scale shape + if (unpack_options.nPartitions) { + std::size_t minPartitions; + if (!unpack_options.bStrictPartitioning) { + // some heuristics that every tbb thread workload has to have 2048 x intrinsics operations at least, + // so in terms of stride, it should be nElementsPerscale/64 * 2048 + const auto nIntrinsicsPerScale = elementsPerScale / 64u; + auto minScaleStride = 2048u / nIntrinsicsPerScale; + minScaleStride = std::max(1u, minScaleStride); + minPartitions = stotal / minScaleStride; + minPartitions = std::max(1u, minPartitions); + minPartitions = std::min(minPartitions, unpack_options.nPartitions); + } else { + minPartitions = unpack_options.nPartitions; + } + + // calculating stride in scale elements space + stride = static_cast(stotal / minPartitions); + } + + const size_t numWork = (stotal + stride - 1) / stride; + + if (unpack_options.bUseOvParallelFor) { + ov::parallel_for(numWork, [unpack_body, stride](size_t index) { + unpack_body(index, stride); + }); + } else { + for (std::size_t index = 0; index < numWork; index++) { + unpack_body(index, stride); + } + } +#else + OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); +#endif +} + +void ov::npuw::util::XARCH::unpack_u4f16_asymm_zp(const ov::SoPtr& from, + const ov::SoPtr& zerop, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(zerop->is_continuous()); + NPUW_ASSERT(scale->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + + const auto& from_shape = from->get_shape(); + NPUW_ASSERT(from_shape.back() % 64 == 0); + + // 3-channel (group-wise) scale factors are + // supported. 
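(Unlike the single-zero-point variant above, this routine receives one u4 zero point per scale group, packed two groups per byte; that is why the loop below alternates lo4()/hi4() on the zero-point pointer and advances it only after every odd group index. A small sketch of that selection logic, illustrative only; group_zero_point is a hypothetical helper, not part of this patch:)

    #include <cstddef>
    #include <cstdint>

    // Pick the u4 zero point for scale group g from a packed buffer
    // (two groups per byte, low nibble first). Illustrative sketch only.
    inline float group_zero_point(const uint8_t* zp_packed, std::size_t g) {
        const uint8_t byte = zp_packed[g / 2];
        return static_cast<float>((g % 2 == 0) ? (byte & 0x0F) : (byte >> 4));
    }
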
+ + const auto& scale_shape = scale->get_shape(); + NPUW_ASSERT(scale_shape.size() == 3); + if (scale_shape.size() == 3) { + NPUW_ASSERT(scale_shape[0] == from_shape[0]); + NPUW_ASSERT(scale_shape[1] == from_shape[1]); + NPUW_ASSERT(scale_shape[2] == 1); + } + + const auto& zerop_shape = zerop->get_shape(); + NPUW_ASSERT(zerop_shape.size() == 3); + if (zerop_shape.size() == 3) { + NPUW_ASSERT(zerop_shape[0] == from_shape[0]); + NPUW_ASSERT(zerop_shape[1] == from_shape[1]); + NPUW_ASSERT(zerop_shape[2] == 1); + } + + const auto zerop_elem_type = zerop->get_element_type(); + const auto scale_elem_type = scale->get_element_type(); + NPUW_ASSERT(zerop_elem_type == ov::element::u4); + NPUW_ASSERT(scale_elem_type == ov::element::f16); + +#if defined(HAVE_AVX2) + // This conversion combines u4tof32 and f32tof16. Here we + // - read 256 bits (= 32 bytes, = 64 u4 elements) + // - write 1024 bits (= 128 bytes, = 64 f16 elements) + // per every iteration, what translates to (from->size() / 64) iterations + + const std::size_t total = to->get_size(); + const std::size_t stotal = scale->get_size(); + const std::size_t elementsPerScale = total / stotal; + + const uint8_t* const pSrc = static_cast(from->data()); // 2 x u4 elements + const uint8_t* const pZer = static_cast(zerop->data()); // 2 x u4 element + const int8_t* const pScl = static_cast(scale->data()); // 1 x f16 element + const int16_t* pDst = static_cast(to->data()); // 1 x f16 element + + auto unpack_body = [pSrc, pDst, pScl, pZer, elementsPerScale, scale_elem_type, zerop_elem_type, stotal]( + std::size_t sindex, + std::size_t stride) { + // number of vectorized operations per scale + size_t elementsPerScaleVectorized = elementsPerScale / 64; + + uint8_t const* pSrcLocal = pSrc + 32 * elementsPerScaleVectorized * sindex * stride; + int8_t const* pSclLocal = pScl + scale_elem_type.size() * sindex * stride; + uint8_t const* pZerLocal = pZer + zerop_elem_type.size() * sindex * stride / 2; + int16_t* pDstLocal = const_cast(pDst) + 64 * elementsPerScaleVectorized * sindex * stride; + + // if it is last iteration current stride can be smaller - lets check that + sindex *= stride; + const auto jobFinish = std::min(sindex + stride, stotal); + + for (; sindex < jobFinish; sindex++) { + __m256 svalVec = avx2_load_scale(pSclLocal, scale_elem_type); + __m256 zvalVec = _mm256_set1_ps(static_cast((sindex % 2 == 0) ? 
lo4(*pZerLocal) : hi4(*pZerLocal))); + + for (std::size_t index = 0; index < elementsPerScale; index += 64) { + __m128i* outv[] = { + reinterpret_cast<__m128i*>(pDstLocal), + reinterpret_cast<__m128i*>(pDstLocal + 8), + reinterpret_cast<__m128i*>(pDstLocal + 16), + reinterpret_cast<__m128i*>(pDstLocal + 24), + reinterpret_cast<__m128i*>(pDstLocal + 32), + reinterpret_cast<__m128i*>(pDstLocal + 40), + reinterpret_cast<__m128i*>(pDstLocal + 48), + reinterpret_cast<__m128i*>(pDstLocal + 56), + }; + __m256i himask = _mm256_set1_epi8(static_cast(0xF0)); + __m256i lomask = _mm256_set1_epi8(static_cast(0x0F)); + + // loading 256 bit u4 into unalligned memory , so 64 elements + // cannot use aligned version here like _mm256_load_si256 - segfault even on unit tests + __m256i xmmData = _mm256_lddqu_si256(reinterpret_cast<__m256i const*>(pSrcLocal)); + + // unpacking with interleaving + __m256i vht = _mm256_and_si256(xmmData, himask); + __m256i xmmUnpackedLo = _mm256_srli_epi16(vht, 4); // 32 x i8 + __m256i xmmUnpackedHi = _mm256_and_si256(xmmData, lomask); // 32 x i8 + + // need 4 portions of 8 x i8 elements + __m128i unpacked32LoHi = _mm256_castsi256_si128(xmmUnpackedLo); // lower 16 x i8 + __m128i unpacked32LoLo = _mm256_extractf128_si256(xmmUnpackedLo, 1); // higher 16 x i8 + + __m128i unpacked32HiHi = _mm256_castsi256_si128(xmmUnpackedHi); // lower 16 x i8 + __m128i unpacked32HiLo = _mm256_extractf128_si256(xmmUnpackedHi, 1); // higher 16 x i8 + + // converting to 32 x f16 + __m128i f16LoLo[] = {avx2_u8tof16_hi(unpacked32LoLo, zvalVec, svalVec), + avx2_u8tof16_lo(unpacked32LoLo, zvalVec, svalVec)}; + + __m128i f16LoHi[] = { + avx2_u8tof16_hi(unpacked32LoHi, zvalVec, svalVec), + avx2_u8tof16_lo(unpacked32LoHi, zvalVec, svalVec), + }; + + __m128i f16HiLo[] = {avx2_u8tof16_hi(unpacked32HiLo, zvalVec, svalVec), + avx2_u8tof16_lo(unpacked32HiLo, zvalVec, svalVec)}; + __m128i f16HiHi[] = {avx2_u8tof16_hi(unpacked32HiHi, zvalVec, svalVec), + avx2_u8tof16_lo(unpacked32HiHi, zvalVec, svalVec)}; + + // interleaving back + __m128i interleaved[] = {_mm_unpacklo_epi16(f16HiHi[0], f16LoHi[0]), + _mm_unpackhi_epi16(f16HiHi[0], f16LoHi[0]), + _mm_unpacklo_epi16(f16HiHi[1], f16LoHi[1]), + _mm_unpackhi_epi16(f16HiHi[1], f16LoHi[1]), + _mm_unpacklo_epi16(f16HiLo[0], f16LoLo[0]), + _mm_unpackhi_epi16(f16HiLo[0], f16LoLo[0]), + _mm_unpacklo_epi16(f16HiLo[1], f16LoLo[1]), + _mm_unpackhi_epi16(f16HiLo[1], f16LoLo[1])}; + + // store the results + _mm_storeu_si128(outv[0], interleaved[0]); + _mm_storeu_si128(outv[1], interleaved[1]); + _mm_storeu_si128(outv[2], interleaved[2]); + _mm_storeu_si128(outv[3], interleaved[3]); + _mm_storeu_si128(outv[4], interleaved[4]); + _mm_storeu_si128(outv[5], interleaved[5]); + _mm_storeu_si128(outv[6], interleaved[6]); + _mm_storeu_si128(outv[7], interleaved[7]); + + pSrcLocal += 32; // shift pSrc only by 32 since it is 64 x u4 + pDstLocal += 64; // note pDst is int16_t, so 64 x f16 -> 64 elements + } // for(index) + pSclLocal += scale_elem_type.size(); + if (sindex % 2 == 1) { + pZerLocal += zerop_elem_type.size(); + } + } // for(sindex) + }; + + size_t stride{1}; + + // since scaling is always 64 elements aligned operations, lets partition only in scale shape + if (unpack_options.nPartitions) { + std::size_t minPartitions; + if (!unpack_options.bStrictPartitioning) { + // some heuristics that every tbb thread workload has to have 2048 x intrinsics operations at least, + // so in terms of stride, it should be nElementsPerscale/64 * 2048 + const auto nIntrinsicsPerScale = 
elementsPerScale / 64u; + auto minScaleStride = 2048u / nIntrinsicsPerScale; + minScaleStride = std::max(1u, minScaleStride); + minPartitions = stotal / minScaleStride; + minPartitions = std::max(1u, minPartitions); + minPartitions = std::min(minPartitions, unpack_options.nPartitions); + } else { + minPartitions = unpack_options.nPartitions; + } + + // calculating stride in scale elements space + stride = static_cast(stotal / minPartitions); + } + + const size_t numWork = (stotal + stride - 1) / stride; + + if (unpack_options.bUseOvParallelFor) { + ov::parallel_for(numWork, [unpack_body, stride](size_t index) { + unpack_body(index, stride); + }); + } else { + for (std::size_t index = 0; index < numWork; index++) { + unpack_body(index, stride); + } + } +#else + OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); +#endif +} + +void ov::npuw::util::XARCH::unpack_u4f16_z(const ov::SoPtr& from, + const ov::SoPtr& zerop, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(zerop->is_continuous()); + NPUW_ASSERT(scale->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + + // Only single-size ZP is supported + NPUW_ASSERT(zerop->get_size() == 1); + + const auto& from_shape = from->get_shape(); + NPUW_ASSERT(from_shape.back() % 64 == 0); + + const auto& scale_shape = scale->get_shape(); + NPUW_ASSERT(scale_shape.size() == 3); + NPUW_ASSERT(scale_shape[0] == from_shape[0]); + NPUW_ASSERT(scale_shape[2] == from_shape[2]); + NPUW_ASSERT(scale_shape[1] == 1); + + const auto zerop_elem_type = zerop->get_element_type(); + const auto scale_elem_type = scale->get_element_type(); + NPUW_ASSERT(zerop_elem_type == ov::element::f32); + NPUW_ASSERT(scale_elem_type == ov::element::f32); + +#if defined(HAVE_AVX2) + // This conversion combines u4tof32 and f32tof16. 
Here we + // - read 256 bits (= 32 bytes, = 64 u4 elements) + // - write 1024 bits (= 128 bytes, = 64 f16 elements) + // per every iteration, what translates to (from->size() / 64) iterations + + const size_t C = from_shape[from_shape.size() - 3]; + const size_t H = from_shape[from_shape.size() - 2]; + const size_t W = from_shape[from_shape.size() - 1]; + + const uint8_t* const pSrc = static_cast(from->data()); // 2 x u4 elements + const float* const pScl = static_cast(scale->data()); // 1 x f32 element + int16_t* pDst = static_cast(to->data()); // 1 x f16 element + + const float zval = avx2_load_f32(reinterpret_cast(zerop->data()), zerop_elem_type); + __m256 zvalVec = _mm256_set1_ps(zval); + + auto unpack_body = [&](size_t job_index, size_t stride) { + size_t start_c = job_index * stride; + size_t end_c = std::min(C, start_c + stride); + + for (size_t c = start_c; c < end_c; ++c) { + for (size_t h = 0; h < H; ++h) { + for (size_t w = 0; w < W; w += 64) { + const uint8_t* pSrc_iter = pSrc + (w + W * h + W * H * c) / 2; + __m256i vinput = _mm256_lddqu_si256(reinterpret_cast(pSrc_iter)); + const float* pScl_iter = pScl + w + W * c; + int16_t* pDst_iter = pDst + w + W * h + W * H * c; + + __m256 svalVec[8]; + for (int i = 0; i < 8; ++i) { + svalVec[i] = _mm256_loadu_ps(pScl_iter + i * 8); + } + + // vectorized unpack u4 to f16 + __m128i htmp[8]; // 64 x f16 + avx2_u4tof16(vinput, htmp, zvalVec, svalVec); + + for (int i = 0; i < 8; ++i) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(pDst_iter + i * 8), htmp[i]); + } + } + } + } + }; + + size_t stride = C; + size_t num_jobs = 1; + + if (unpack_options.nPartitions) { + if (unpack_options.bStrictPartitioning) { + stride = (C + unpack_options.nPartitions - 1) / unpack_options.nPartitions; + num_jobs = unpack_options.nPartitions; + } else { + stride = std::max(1, C / unpack_options.nPartitions); + num_jobs = (C + stride - 1) / stride; + } + } + + if (unpack_options.bUseOvParallelFor) { + ov::parallel_for(num_jobs, [&](size_t job_index) { + unpack_body(job_index, stride); + }); + } else { + for (size_t job_index = 0; job_index < num_jobs; ++job_index) { + unpack_body(job_index, stride); + } + } +#else + OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); +#endif +} + +void ov::npuw::util::XARCH::unpack_u4f32(const ov::SoPtr& from, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + + uint8_t const* pSrc = static_cast(from->data()); // 2 x u4 elements + float* pDst = static_cast(to->data()); // 1 x f32 element + + const std::size_t total = from->get_size(); + for (std::size_t index = 0; index < total; index += 2) { + pDst[0] = static_cast(lo4(*pSrc)); // LSB is [0] - since OpenVINO 2024.0! + pDst[1] = static_cast(hi4(*pSrc)); // MSB is [1] - since OpenVINO 2024.0! 
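// Illustration (not part of the original loop): with this LSB-first nibble
// order, a packed byte 0x2A expands to two elements as
//   lo4(0x2A) = 0x0A -> pDst[0] = 10.0f
//   hi4(0x2A) = 0x02 -> pDst[1] =  2.0f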
+ pSrc++; + pDst += 2; + } +} + +void ov::npuw::util::XARCH::unpack_i8f16(const ov::SoPtr& from, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + NPUW_ASSERT(from->get_size() % 8 == 0); + +#if defined(HAVE_AVX2) + constexpr std::size_t VECSIZE = 8; + + const std::size_t total = from->get_size(); + int8_t const* pSrc = from->data(); + int16_t* pDst = static_cast(to->data()); + + for (std::size_t index = 0; index < total; index += VECSIZE) { + const __m128i* pSrcV = reinterpret_cast(pSrc); + __m128i* pDstV = reinterpret_cast<__m128i*>(pDst); + __m128i i8vec = _mm_loadl_epi64(pSrcV); // load: 8 x i8 [ 64b of 128b] + __m128i f16vec = avx2_i8tof16(i8vec); + _mm_store_si128(pDstV, f16vec); // store: 8 x f16 [128b] + pSrc += 8; + pDst += 8; + } +#else + OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); +#endif +} + +void ov::npuw::util::XARCH::unpack_i8f16_scale(const ov::SoPtr& from, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(scale->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + NPUW_ASSERT(from->get_size() % 8 == 0); + NPUW_ASSERT(scale->get_shape()[0] == from->get_shape()[0]); + NPUW_ASSERT(scale->get_shape()[1] == 1); + + const auto scale_elem_type = scale->get_element_type(); + NPUW_ASSERT(scale_elem_type == ov::element::f32 || scale_elem_type == ov::element::f16); + +#if defined(HAVE_AVX2) + constexpr std::size_t VECSIZE = 8; + + const std::size_t total = from->get_size(); + const std::size_t stotal = scale->get_size(); + int8_t const* pSrc = from->data(); + int8_t const* pScl = static_cast(scale->data()); + int16_t* pDst = static_cast(to->data()); + + for (std::size_t sindex = 0u; sindex < stotal; sindex++) { + __m256 svec = avx2_load_scale(pScl, scale_elem_type); + for (std::size_t index = 0u; index < (total / stotal); index += VECSIZE) { + __m128i const* pSrcV = reinterpret_cast(pSrc); + __m128i* pDstV = reinterpret_cast<__m128i*>(pDst); + __m128i i8vec = _mm_loadl_epi64(pSrcV); // load: 8 x i8 [ 64b of 128b] + __m128i f16vec = avx2_i8tof16(i8vec, svec); // convert & scale + _mm_store_si128(pDstV, f16vec); // store: 8 x f16 [128b] + pSrc += 8; + pDst += 8; + } // index + pScl += scale_elem_type.size(); + } // sindex +#else + OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); +#endif +} + +void ov::npuw::util::XARCH::unpack_u8f16(const ov::SoPtr& from, + const ov::SoPtr& zerop, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& _options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(zerop->is_continuous()); + NPUW_ASSERT(scale->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + NPUW_ASSERT(from->get_size() % 8 == 0); + NPUW_ASSERT(scale->get_shape()[0] == from->get_shape()[0]); + NPUW_ASSERT(scale->get_shape()[1] == 1); + NPUW_ASSERT(zerop->get_shape()[0] == from->get_shape()[0]); + NPUW_ASSERT(zerop->get_shape()[1] == 1); + + const auto scale_elem_type = scale->get_element_type(); + NPUW_ASSERT(scale_elem_type == ov::element::f32 || scale_elem_type == ov::element::f16); + + const auto zerop_elem_type = zerop->get_element_type(); + NPUW_ASSERT(zerop_elem_type == ov::element::u8); + +#if defined(HAVE_AVX2) + constexpr std::size_t 
VECSIZE = 8; + + const std::size_t total = from->get_size(); + const std::size_t stotal = scale->get_size(); + uint8_t const* pSrc = from->data(); + uint8_t const* pZrp = zerop->data(); + int8_t const* pScl = static_cast(scale->data()); + int16_t* pDst = static_cast(to->data()); + + for (std::size_t sindex = 0u; sindex < stotal; sindex++) { + __m256 svec = avx2_load_scale(pScl, scale_elem_type); + __m128i u8zp = _mm_set1_epi8(*pZrp); // bcast: 8 x u8 + __m256i u32zp = _mm256_cvtepu8_epi32(u8zp); // i32 zero point + __m256 f32zp = _mm256_cvtepi32_ps(u32zp); // f32 zero point + for (std::size_t index = 0u; index < (total / stotal); index += VECSIZE) { + __m128i const* pSrcV = reinterpret_cast(pSrc); + __m128i* pDstV = reinterpret_cast<__m128i*>(pDst); + __m128i u8in = _mm_loadl_epi64(pSrcV); // load: 8 x u8 + __m128i f16vec = avx2_u8tof16(u8in, f32zp, svec); // convert & scale + _mm_store_si128(pDstV, f16vec); // store: 8 x f16 + pSrc += VECSIZE; + pDst += VECSIZE; + } // index + pScl += scale_elem_type.size(); + pZrp++; + } // sindex +#else + OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); +#endif +} + +ov::Tensor ov::npuw::util::XARCH::to_f16(const ov::Tensor& t) { + ov::Shape shape = t.get_shape(); + NPUW_ASSERT(t.get_element_type() == ov::element::f32); + NPUW_ASSERT(t.get_size() % 8 == 0); + NPUW_ASSERT(t.is_continuous()); + + ov::Tensor tnew(ov::element::f16, shape); + +#if defined(HAVE_AVX2) + const float* psrc = t.data(); + uint8_t* pdst = static_cast(tnew.data()); + + for (std::size_t i = 0; i < t.get_size() / 8; i++) { + __m256 vsrc = _mm256_loadu_ps(psrc); + __m128i vout = _mm256_cvtps_ph(vsrc, _MM_FROUND_TO_NEAREST_INT); + __m128i* pout = reinterpret_cast<__m128i*>(pdst); + _mm_storeu_si128(pout, vout); + psrc += 8; // offset in sizeof(float) + pdst += (8 * 2); // offset in bytes + } +#else + OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); +#endif + return tnew; +} diff --git a/src/plugins/intel_npu/src/plugin/npuw/util_xarch.hpp b/src/plugins/intel_npu/src/plugin/npuw/util_xarch.hpp new file mode 100644 index 00000000000000..0f0d9912f3b221 --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/util_xarch.hpp @@ -0,0 +1,88 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "logging.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/runtime/itensor.hpp" +#include "openvino/runtime/so_ptr.hpp" +#include "util.hpp" + +namespace ov { +namespace npuw { +namespace util { +namespace XARCH { + +void unpack_i4i8(const ov::SoPtr& from, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options); + +void unpack_u4i8(const ov::SoPtr& from, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options); + +void unpack_i4f16(const ov::SoPtr& from, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options); + +void unpack_i4f16_scale(const ov::SoPtr& from, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options); + +void unpack_i4f16_z(const ov::SoPtr& from, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options); + +void unpack_u4f16(const ov::SoPtr& from, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options); + +void unpack_u4f16_scale_zp(const ov::SoPtr& from, + const ov::SoPtr& zerop, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options); + 
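A readability note (sketch only, not part of the header): each unpack_* helper
declared here takes ov::SoPtr<ov::ITensor> views of the packed weights and, per
element, applies the same dequantization, with the zero point fixed to 0 for
the symmetric i4/i8 variants:

// Common per-element operation behind the unpack_* kernels (illustrative).
inline float dequantize(float quantized, float zero_point, float scale) {
    return (quantized - zero_point) * scale;
}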
+void unpack_u4f16_asymm_zp(const ov::SoPtr& from, + const ov::SoPtr& zerop, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options); + +void unpack_u4f16_z(const ov::SoPtr& from, + const ov::SoPtr& zerop, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options); + +void unpack_u4f32(const ov::SoPtr& from, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options); + +void unpack_i8f16(const ov::SoPtr& from, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options); + +void unpack_i8f16_scale(const ov::SoPtr& from, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options); + +void unpack_u8f16(const ov::SoPtr& from, + const ov::SoPtr& zerop, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& _options); + +ov::Tensor to_f16(const ov::Tensor& t); + +} // namespace XARCH +} // namespace util +} // namespace npuw +} // namespace ov From 54db50b893b72990a10381710edb2bf4b506f78e Mon Sep 17 00:00:00 2001 From: Wilson Seok Date: Thu, 17 Oct 2024 04:43:10 -0700 Subject: [PATCH 07/32] [GPU] Fix weight reorder src format to avoid inconsistency ndims of src/dst in weight reorder (#27051) ### Details: - Fix weight reorder src format to avoid inconsistancy ndims of src/dst in weight reorder ### Tickets: - 154614 --- src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp index b8ff112cead147..19ea02c7c66d28 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp @@ -596,6 +596,14 @@ bool keep_weights_reorder_shape_consistent(cldnn::layout& layout, const dnnl::me // Check whether they have same values and orders. 
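// Illustrative scenario for the fix below (assumed values, not from the
// ticket): the weight layout still carries a 5-D default format with dims
// {16, 8, 1, 1, 1} while the reorder descriptor is 4-D {16, 8, 1, 1}; both
// filter down to {16, 8}, so the shapes are considered consistent, but the
// rank differs. The fix resets the format to the default format of the
// descriptor rank, and bails out (returns false) for non-default weight
// formats instead of producing a src/dst rank mismatch in the reorder.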
if (filtered_target_dims == filtered_desc_dims) { layout.set_partial_shape(desc_dims); + if (layout.get_rank() != desc_dims.size()) { + if (cldnn::format::is_default_format(layout.format)) { + layout.format = cldnn::format::get_default_format(desc_dims.size()); + } else { + // TO-DO: Consider that weight format is not default format + return false; + } + } return true; } else { return false; From 2f62be0a7ba139e6ed1580d2b48cef6d3d986127 Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Thu, 17 Oct 2024 21:17:08 +0800 Subject: [PATCH 08/32] [CPU] Support different Key/Value head size and not multiples 16 of head size length for SDPA/PA (#26945) ### Details: - *Support different kv head size for SDPA and PagedAttention* - *Support not 16 times kv head size for PagedAttention* ### Tickets: - *[152445](https://jira.devtools.intel.com/browse/CVS-152445)* - *[145986](https://jira.devtools.intel.com/browse/CVS-145986)* --- .../state_management_pattern.cpp | 8 +- src/core/src/op/paged_attention.cpp | 28 ++- .../nodes/kernels/scaled_attn/attn_memcpy.cpp | 20 +- .../nodes/kernels/scaled_attn/attn_quant.cpp | 9 +- .../nodes/kernels/scaled_attn/executor_pa.cpp | 108 +++++++---- .../kernels/scaled_attn/mha_single_token.cpp | 21 +- .../intel_cpu/src/nodes/paged_attn.cpp | 22 ++- .../intel_cpu/src/nodes/scaled_attn.cpp | 181 ++++++++++-------- .../shape_inference/custom/scaled_attn.cpp | 32 +++- .../cpu_opset/common/op/sdpa.cpp | 37 +++- .../subgraph_tests/src/arm/concat_sdp.cpp | 1 + .../subgraph_tests/src/classes/concat_sdp.cpp | 49 +++-- .../subgraph_tests/src/classes/concat_sdp.hpp | 6 +- .../subgraph_tests/src/common/concat_sdp.cpp | 1 + .../subgraph_tests/src/x64/concat_sdp.cpp | 1 + 15 files changed, 336 insertions(+), 188 deletions(-) diff --git a/src/common/transformations/src/transformations/sdpa_to_paged_attention/state_management_pattern.cpp b/src/common/transformations/src/transformations/sdpa_to_paged_attention/state_management_pattern.cpp index c259e9387d9dd0..28e7cd90019b34 100644 --- a/src/common/transformations/src/transformations/sdpa_to_paged_attention/state_management_pattern.cpp +++ b/src/common/transformations/src/transformations/sdpa_to_paged_attention/state_management_pattern.cpp @@ -383,12 +383,18 @@ ov::pass::StateManagementPattern::StateManagementPattern(ParameterVector& kv_par auto paged_attention = std::make_shared(pa_arguments); + // The output shape of PagedAttention will be converted to [batch, 1, head_num, head_size_v], the head_size_v + // may be different from head_size_q/head_size_k. 
The head_size_v could be got from the shape of value input + auto hidden_dim_v = std::make_shared(std::make_shared(v_target_layout), + v0::Constant::create(element::i64, Shape{}, {-1}), + v0::Constant::create(element::i64, Shape{}, {0})); + auto pa_shape = std::make_shared( OutputVector{ v0::Constant::create(element::i64, Shape{1}, {0}), v0::Constant::create(element::i64, Shape{1}, {1}), v0::Constant::create(element::i64, Shape{1}, {-1}), - std::make_shared(hidden_dim, v0::Constant::create(element::i64, Shape{}, {0})), + std::make_shared(hidden_dim_v, v0::Constant::create(element::i64, Shape{}, {0})), }, 0); auto pa_reshape = std::make_shared(paged_attention->output(0), pa_shape, true); diff --git a/src/core/src/op/paged_attention.cpp b/src/core/src/op/paged_attention.cpp index 261b0ce1c47605..cdcb66e86ee33e 100644 --- a/src/core/src/op/paged_attention.cpp +++ b/src/core/src/op/paged_attention.cpp @@ -4,6 +4,7 @@ #include "openvino/op/paged_attention.hpp" +#include "dimension_util.hpp" #include "itt.hpp" #include "openvino/op/op.hpp" @@ -146,10 +147,33 @@ void PagedAttentionExtension::validate_and_infer_types() { get_input_element_type(12), "."); + // value head_size may be not same with key + auto out_ps = get_input_partial_shape(0); + const auto& key_ps = get_input_partial_shape(1); + const auto& value_ps = get_input_partial_shape(2); + if (out_ps.rank().is_static()) { + if (key_ps.rank().is_static() && value_ps.rank().is_static() && key_ps[1].is_static()) { + // The dim of out_ps[1] should be `num_heads * v_head_size`, it can be got from: + // because: + // q: query_ps[1] = num_heads * head_size + // k: key_ps[1] = num_kv_heads * head_size + // v: value_ps[1] = num_kv_heads * v_head_size + // therefore: + // q * v / k = (num_heads * head_size) * (num_kv_heads * v_head_size) / + // (num_kv_heads * head_size) = num_heads * v_head_size + out_ps[1] = out_ps[1] * value_ps[1] / key_ps[1].get_length(); + NODE_VALIDATION_CHECK(this, + !ov::util::dim::is_empty(out_ps[1]), + "The last dimension of output should not be empty."); + } else { + out_ps[1] = Dimension::dynamic(); + } + } + if (m_output_type[0] == ov::element::undefined) { - set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); + set_output_type(0, get_input_element_type(0), out_ps); } else { - set_output_type(0, m_output_type[0], get_input_partial_shape(0)); + set_output_type(0, m_output_type[0], out_ps); } if (m_output_type[1] == ov::element::undefined) { diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.cpp index 21d8fbbe6e298f..755330bd850c4d 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.cpp @@ -51,16 +51,14 @@ void attn_memcpy_kernel(const ov::intel_cpu::PlainTensor& k_input, const ov::intel_cpu::PlainTensor& past_k_output, const ov::intel_cpu::PlainTensor& past_v_output) { // For compatibility, all input_kvs are permuted to BHLS - size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3]; - // Internal LBHS layout has strides[L] > strides[B] - assert(past_k_output.m_strides[2] >= past_k_output.m_strides[0]); + size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], SV = v_input.m_dims[3]; parallel_for3d(L1, B, H, [&](size_t m, size_t b, size_t h) { attn_copy(past_k_output.ptr(b, h, m, 0), k_input.ptr(b, h, m, 0), S); 
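// Worked example of the num_heads * v_head_size derivation used above for
// the PagedAttention output shape (illustrative numbers only):
//   q[1] = num_heads    * head_size   = 32 * 128 = 4096
//   k[1] = num_kv_heads * head_size   =  8 * 128 = 1024
//   v[1] = num_kv_heads * v_head_size =  8 *  96 =  768
//   out[1] = q[1] * v[1] / k[1] = 4096 * 768 / 1024 = 3072 = 32 * 96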
attn_copy(past_v_output.ptr(b, h, m, 0), v_input.ptr(b, h, m, 0), - S); + SV); }); } @@ -69,16 +67,14 @@ static void attn_memcpy_kernel(const ov::intel_cpu::PlainTensor& k_input, const ov::intel_cpu::PlainTensor& past_k_output, const ov::intel_cpu::PlainTensor& past_v_output) { // For compatibility, all input_kvs are permuted to BHLS - size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3]; - // Internal LBHS layout has strides[L] > strides[B] - assert(past_k_output.m_strides[2] >= past_k_output.m_strides[0]); + size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], SV = v_input.m_dims[3]; parallel_for3d(L1, B, H, [&](size_t m, size_t b, size_t h) { std::memcpy(past_k_output.ptr_v(b, h, m, 0), k_input.ptr_v(b, h, m, 0), S * k_input.m_element_size); std::memcpy(past_v_output.ptr_v(b, h, m, 0), v_input.ptr_v(b, h, m, 0), - S * v_input.m_element_size); + SV * v_input.m_element_size); }); } @@ -88,7 +84,7 @@ static void paged_attn_memcpy_kernel(const ov::intel_cpu::PlainTensor& k_input, const ov::intel_cpu::PlainTensor& past_k_output, const ov::intel_cpu::PlainTensor& past_v_output, const ov::intel_cpu::PlainTensor& slot_mapping) { - size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3]; + size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], SV = v_input.m_dims[3]; size_t block_size = past_k_output.m_dims[2]; parallel_for3d(B, L1, H, [&](size_t b, size_t m, size_t h) { auto slot = slot_mapping.ptr(b)[m]; @@ -100,7 +96,7 @@ static void paged_attn_memcpy_kernel(const ov::intel_cpu::PlainTensor& k_input, S); attn_copy(past_v_output.ptr(block_number, h, block_offset, 0), v_input.ptr(b, h, m, 0), - S); + SV); }); } @@ -109,7 +105,7 @@ static void paged_attn_memcpy_kernel(const ov::intel_cpu::PlainTensor& k_input, const ov::intel_cpu::PlainTensor& past_k_output, const ov::intel_cpu::PlainTensor& past_v_output, const ov::intel_cpu::PlainTensor& slot_mapping) { - size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3]; + size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], SV = v_input.m_dims[3]; size_t block_size = past_k_output.m_dims[2]; parallel_for3d(B, L1, H, [&](size_t b, size_t m, size_t h) { auto slot = slot_mapping.ptr(b)[m]; @@ -121,7 +117,7 @@ static void paged_attn_memcpy_kernel(const ov::intel_cpu::PlainTensor& k_input, S * k_input.m_element_size); std::memcpy(past_v_output.ptr_v(block_number, h, block_offset, 0), v_input.ptr_v(b, h, m, 0), - S * v_input.m_element_size); + SV * v_input.m_element_size); }); } diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp index d95f973fa9f2f0..66772bda03db51 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp @@ -178,8 +178,7 @@ static void attn_quant_mt(const ov::intel_cpu::PlainTensor& k_src, const ov::intel_cpu::PlainTensor& k_scale_zp, const ov::intel_cpu::PlainTensor& v_scale_zp) { // For compatibility, all input_kvs are permuted to BHLS - size_t B = k_src.m_dims[0], H = k_src.m_dims[1], L1 = k_src.m_dims[2], S = k_src.m_dims[3]; - // Internal LBHS layout has strides[L] > strides[B] + size_t B = k_src.m_dims[0], H = k_src.m_dims[1], L1 = k_src.m_dims[2], S = k_src.m_dims[3], SV = 
v_src.m_dims[3]; parallel_for3d(L1, B, H, [&](size_t m, size_t b, size_t h) { auto p_k = k_scale_zp.ptr(m, b, h); auto p_v = v_scale_zp.ptr(m, b, h); @@ -190,7 +189,7 @@ static void attn_quant_mt(const ov::intel_cpu::PlainTensor& k_src, p_k[1]); quant_u8(v_src.ptr(b, h, m), v_dst.ptr(b, h, m), - S, + SV, p_v[0], p_v[1]); }); @@ -202,7 +201,7 @@ static void paged_attn_quant_mt(const ov::intel_cpu::PlainTensor& k_src, const ov::intel_cpu::PlainTensor& k_dst, const ov::intel_cpu::PlainTensor& v_dst, const ov::intel_cpu::PlainTensor& slot_mapping) { - size_t B = k_src.m_dims[0], H = k_src.m_dims[1], L1 = k_src.m_dims[2], S = k_src.m_dims[3]; + size_t B = k_src.m_dims[0], H = k_src.m_dims[1], L1 = k_src.m_dims[2], S = k_src.m_dims[3], SV = v_src.m_dims[3]; size_t block_size = k_dst.m_dims[2]; parallel_for3d(B, L1, H, [&](size_t b, size_t m, size_t h) { auto slot = slot_mapping.ptr(b)[m]; @@ -221,7 +220,7 @@ static void paged_attn_quant_mt(const ov::intel_cpu::PlainTensor& k_src, p_k[1]); quant_u8(v_src.ptr(b, h, m), v_dst.ptr(block_number, h, block_offset) + sizeof(float) + sizeof(float), - S, + SV, p_v[0], p_v[1]); }); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp index 971aa6bb58c994..bef34881ca41bc 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp @@ -708,14 +708,38 @@ static void pack_32x16_kernel(T* dst, T* src, size_t dst_stride, size_t src_stri } template::value || std::is_same::value), bool>::type> -static void pack_32Nx16K(T* dst, T* src, T* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { +static void pack_32xK_kernel(T* dst, T* src, size_t dst_stride, size_t src_stride, size_t K) { + static const uint64_t idx[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + auto midx = _mm512_loadu_si512(idx); + __mmask16 mask = (1 << K) - 1; + for (size_t i = 0; i < K; i++) { + auto x = _mm256_maskz_loadu_epi16(mask, src); // [a1 a2 a3 a4] total 256-bits in 4 64bits unit + auto y = _mm256_maskz_loadu_epi16(mask, src + src_stride); // [b1 b2 b3 b4] total 256-bits + auto a = _mm512_castsi256_si512(x); + auto b = _mm512_castsi256_si512(y); + a = _mm512_permutexvar_epi64(midx, a); // [a1 x | a2 x | a3 x | a4 x] + b = _mm512_permutexvar_epi64(midx, b); // [b1 x | b2 x | b3 x | b4 x] + auto B0 = _mm512_unpacklo_epi16(a, b); + _mm512_mask_storeu_epi32(dst, mask, B0); + src += 2 * src_stride; + dst += 2 * dst_stride; + } +} + +template::value || std::is_same::value), bool>::type> +static void pack_32NxK(T* dst, T* src, T* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { for (size_t n = 0; n < N; n += 32) { size_t k = 0; for (; k + 32 <= K; k += 32) { pack_32x32_kernel(dst + k * 2, src + k, dst_stride, src_stride); } - if (k < K) + if (k + 16 <= K) { pack_32x16_kernel(dst + k * 2, src + k, dst_stride, src_stride); + k += 16; + } + if (k < K) { + pack_32xK_kernel(dst + k * 2, src + k, dst_stride, src_stride, K - k); + } dst += 32 * dst_stride; src += 32 * src_stride; @@ -723,7 +747,7 @@ static void pack_32Nx16K(T* dst, T* src, T* tmp, size_t N, size_t K, size_t dst_ } template::value || std::is_same::value), bool>::type> -static void pack_32Nx16K(T* dst, uint8_t* src, T* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { +static void pack_32NxK(T* dst, uint8_t* src, T* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // The layout for per token per head: // 
|scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| // The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) @@ -735,14 +759,14 @@ static void pack_32Nx16K(T* dst, uint8_t* src, T* tmp, size_t N, size_t K, size_ s += src_stride + 2 * sizeof(float); t += src_stride; } - pack_32Nx16K(dst, tmp, reinterpret_cast(0), N, K, dst_stride, src_stride); + pack_32NxK(dst, tmp, reinterpret_cast(0), N, K, dst_stride, src_stride); } #endif template -static void pack_32Nx16K(float* dst, T* src, float* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { +static void pack_32NxK(float* dst, T* src, float* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // never called - OPENVINO_THROW("pack_32Nx16K: should not be called."); + OPENVINO_THROW("pack_32NxK: should not be called."); } template @@ -750,6 +774,7 @@ struct MHAHelper { // initialize once size_t _H; size_t _S; + size_t _SV; size_t _Hk; size_t _h_each_group_len; size_t _block_size; @@ -785,7 +810,7 @@ struct MHAHelper { _weight.resize({size_t{1}, size_t{1}, size_t{1}, size_t{1}}); } - void init(size_t H, size_t S, size_t Hk, size_t h_each_group_len, size_t block_size, size_t sliding_window, + void init(size_t H, size_t S, size_t SV, size_t Hk, size_t h_each_group_len, size_t block_size, size_t sliding_window, float d_scale, size_t kv_len, bool init_alibi_lookup) { // query shape: [B, H, L, S] // present_key shape: [block, H, 32, S] @@ -799,6 +824,7 @@ struct MHAHelper { auto in_type = precision_of::value; _H = H; _S = S; + _SV = SV; _Hk = Hk; _h_each_group_len = h_each_group_len; _block_size = block_size; @@ -811,7 +837,7 @@ struct MHAHelper { auto new_score_stride = std::max(prev_score_stride, want_score_stride); // resize temporary buffers, weight.size(3) will be aligned to block_size _weight.resize({static_cast(_nthr), H, _block_size, new_score_stride}); - _output.resize({static_cast(_nthr), _block_size, H, S}); + _output.resize({static_cast(_nthr), _block_size, H, SV}); // TODO: kernel supports stride if (_qk_gemm.empty() || prev_score_stride < new_score_stride) { @@ -828,20 +854,20 @@ struct MHAHelper { false, in_type); _wv_gemm[i] = std::make_shared(i + 1, - _S, + _SV, _block_size, // if it's bf16, the stride needs double due to reuse float buffer (in_type == ov::element::Type_t::f32 ? 1 : 2) * _weight.stride(2), - _S, + _SV, _output.stride(1), false, in_type); _wv_gemm_acc[i] = std::make_shared(i + 1, - _S, + _SV, _block_size, // if it's bf16, the stride needs double due to reuse float buffer (in_type == ov::element::Type_t::f32 ? 1 : 2) * _weight.stride(2), - _S, + _SV, _output.stride(1), false, in_type, @@ -881,7 +907,7 @@ struct MHAHelper { void init_reorder_buffers(size_t batch, size_t kv_len_in_blocks) { _qk_scratch_b.resize({batch, kv_len_in_blocks, _Hk, _block_size * _S}); - _wv_scratch_b.resize({batch, kv_len_in_blocks, _Hk, _block_size * rnd_up(_S, _block_size)}); + _wv_scratch_b.resize({batch, kv_len_in_blocks, _Hk, _block_size * rnd_up(_SV, _block_size)}); } void init_score_buffers(const PlainTensor& past_lens, const PlainTensor& subsequence_begins) { @@ -992,7 +1018,7 @@ struct MHAHelper { // reuse float buffer, need to use float to compute offset auto* w_ptr = reinterpret_cast(_weight.ptr(ithr, h, 0, 0)); - float* fp32_out_ptr = q_is_xf16 ? _output.ptr(ithr, 0, h, 0) : output_emb.ptr(q_start, h * _S); + float* fp32_out_ptr = q_is_xf16 ? 
_output.ptr(ithr, 0, h, 0) : output_emb.ptr(q_start, h * _SV); // for each weight block, loop through all value block for (size_t v_blk = 0; v_blk < cur_kv_len_blocks; v_blk++) { @@ -1020,12 +1046,12 @@ struct MHAHelper { } if (q_is_xf16) { attn_memcpy2d_kernel(_output.ptr(ithr, 0, h, 0), - output_emb.ptr(q_start, h * _S), + output_emb.ptr(q_start, h * _SV), ov::element::f32, precision_of::value, _output.stride(1), output_emb.stride(0), - _S, + _SV, q_cnt); } } @@ -1091,7 +1117,7 @@ struct MHAHelper { } } - memset(_output.ptr(ithr), 0, q_len * _H * _S * sizeof(float)); + memset(_output.ptr(ithr), 0, q_len * _H * _SV * sizeof(float)); for (size_t pv = 0, i = 0; pv < cur_kv_len; pv += _block_size, i++) { auto block_number = block_table[i]; auto* v = present_value.ptr(block_number, hk); @@ -1100,7 +1126,7 @@ struct MHAHelper { attn_acc_value_block(_output.ptr(ithr, pq, h), _weight.ptr(ithr, h, pq) + pv, v, - _S, + _SV, std::min(_block_size, cur_kv_len - pv)); } } @@ -1108,7 +1134,7 @@ struct MHAHelper { // convert to dst for (size_t pq = 0; pq < q_len; pq++) for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) - cvt_copy(output_emb.ptr(pq, h * _S), _output.ptr(ithr, pq, h), _S); + cvt_copy(output_emb.ptr(pq, h * _SV), _output.ptr(ithr, pq, h), _SV); } // compute one token, loop along batch, head dimensions and kv_len, it's special for very long kv_len with small batch tokens. @@ -1197,7 +1223,7 @@ struct MHAHelper { } // attn_w * V - _output_bhl.resize({static_cast(_nthr), B, q_len, _H, _S}); + _output_bhl.resize({static_cast(_nthr), B, q_len, _H, _SV}); // m_attn_w {B, H, q_len, kv_len} parallel_nt_static(_nthr, [&](const size_t ithr, const size_t nthr) { memset(_output_bhl.ptr(ithr, 0, 0, 0, 0), 0, _output_bhl.stride(0) * sizeof(float)); @@ -1216,7 +1242,7 @@ struct MHAHelper { attn_acc_value_block(_output_bhl.ptr(ithr, b, pq, h), _weight_bhl.ptr(b, h, pq) + pv, v, - _S, + _SV, std::min(_block_size, context_len - pv)); } } @@ -1226,8 +1252,8 @@ struct MHAHelper { parallel_for3d(B, _H, q_len, [&](size_t b, size_t h, size_t pq) { auto* temp = _output_bhl.ptr(0, b, pq, h); size_t temp_stride = _output_bhl.stride(0); - auto* dst = output_emb.ptr(b, pq, h * _S); - attn_reduce(dst, temp, _nthr, _S, temp_stride); + auto* dst = output_emb.ptr(b, pq, h * _SV); + attn_reduce(dst, temp, _nthr, _SV, temp_stride); }); } }; @@ -1375,17 +1401,17 @@ struct MHA { _helper._block_size, _helper._S, _helper._block_size, _helper._S); if (q_is_xf16) { - pack_32Nx16K(_helper._wv_scratch_b.template ptr(batch_in_reorder, kv_block, hk), - v_ptr, - _helper._output.template ptr(ithr), - _helper._block_size, - _helper._S, - rnd_up(_helper._S, _helper._block_size), - _helper._S); + pack_32NxK(_helper._wv_scratch_b.template ptr(batch_in_reorder, kv_block, hk), + v_ptr, + _helper._output.template ptr(ithr), + _helper._block_size, + _helper._SV, + rnd_up(_helper._SV, _helper._block_size), + _helper._SV); } else { // need to decompress if (!q_cache_is_same) { - dequant(_helper._wv_scratch_b.template ptr(batch_in_reorder, kv_block, hk), v_ptr, _helper._block_size, _helper._S); + dequant(_helper._wv_scratch_b.template ptr(batch_in_reorder, kv_block, hk), v_ptr, _helper._block_size, _helper._SV); } } }); @@ -1429,7 +1455,7 @@ struct MHA { sub_query = sub_query.permute({1, 0, 2}); _helper.exec_kernel_multiple(sub_query, v_cache, - output_emb.slice(0, batch_in_token, batch_in_token + q_len).reshape({q_len, _helper._H * _helper._S}), + output_emb.slice(0, batch_in_token, batch_in_token + 
q_len).reshape({q_len, _helper._H * _helper._SV}), _helper._qk_scratch_b.slice(0, batch_in_reorder, batch_in_reorder), _helper._wv_scratch_b.slice(0, batch_in_reorder, batch_in_reorder), block_indices.ptr() + block_indices_begins.ptr()[batch_in_seq], @@ -1518,7 +1544,8 @@ struct AttentionExecutor : public PagedAttentionExecutor { // The layout for per token per head for u8 kv cache: // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| // The actual size needs to deduct scale and zeropoint. - auto S = v_cache.size(3) - (k_cache.m_dt == ov::element::Type_t::u8 ? sizeof(float) * 2 : 0); + auto S = k_cache.size(3) - (k_cache.m_dt == ov::element::Type_t::u8 ? sizeof(float) * 2 : 0); + auto SV = v_cache.size(3) - (k_cache.m_dt == ov::element::Type_t::u8 ? sizeof(float) * 2 : 0); auto block_size = k_cache.size(2); auto H = q.size(1) / S; auto h_each_group_len = 1; @@ -1529,16 +1556,16 @@ struct AttentionExecutor : public PagedAttentionExecutor { q.assert_dims({B_token, H * S}); k.assert_dims({B_token, Hk * S}); - v.assert_dims({B_token, Hk * S}); + v.assert_dims({B_token, Hk * SV}); q = q.reshape({B_token, H, 1, S}); k = k.reshape({B_token, Hk, 1, S}); - v = v.reshape({B_token, Hk, 1, S}); + v = v.reshape({B_token, Hk, 1, SV}); if (k_cache.m_dt == ov::element::Type_t::u8) { k_cache.assert_dims({0, Hk, block_size, S + sizeof(float) * 2}, true); - v_cache.assert_dims({k_cache.m_dims[0], Hk, block_size, S + sizeof(float) * 2}); + v_cache.assert_dims({k_cache.m_dims[0], Hk, block_size, SV + sizeof(float) * 2}); } else { k_cache.assert_dims({0, Hk, block_size, S}, true); - v_cache.assert_dims({k_cache.m_dims[0], Hk, block_size, S}); + v_cache.assert_dims({k_cache.m_dims[0], Hk, block_size, SV}); } past_lens.assert_dims({B_seq}); subsequence_begins.assert_dims({B_seq + 1}); @@ -1549,14 +1576,13 @@ struct AttentionExecutor : public PagedAttentionExecutor { if (alibi_slopes) { alibi_slopes.assert_dims({H}); } - output_emb.assert_dims({B_token, H * S}); - output_emb = output_emb.reshape({B_token, 1, H * S}); + output_emb.assert_dims({B_token, H * SV}); + output_emb = output_emb.reshape({B_token, 1, H * SV}); // TODO: enable block_size to be multiple of 32 OPENVINO_ASSERT(block_size == 32, "CPU: block size must be 32, current: ", block_size); - OPENVINO_ASSERT(S % 16 == 0, "CPU: head size must be multiple of 16, current: ", S); - _helper.init(H, S, Hk, h_each_group_len, block_size, sliding_window, scale, max_context_len, alibi_slopes); + _helper.init(H, S, SV, Hk, h_each_group_len, block_size, sliding_window, scale, max_context_len, alibi_slopes); } void concat_pastkv(const PlainTensor& k, const PlainTensor& v, const PlainTensor& k_cache, const PlainTensor& v_cache, diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp index 0670c744a6da91..1543c168403382 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp @@ -861,6 +861,7 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, auto H = query.size(1); auto q_len = query.size(2); auto S = query.size(3); + auto SV = present_value.size(3); auto h_group_num = present_value.size(1); auto precision = ov::element::f32; if (std::is_same::value) { @@ -991,10 +992,10 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, // attn_w * V // 
Fast Path if there are enough works for each thread if (B >= static_cast(nthr)) { - buf_attn_score.resize({static_cast(nthr), q_len, h_each_group_len, S}); + buf_attn_score.resize({static_cast(nthr), q_len, h_each_group_len, SV}); parallel_for2d(B, h_group_num, [&](size_t b, size_t h_group) { auto ithr = parallel_get_thread_num(); - memset(buf_attn_score.ptr(ithr), 0, q_len * h_each_group_len * S * sizeof(T3)); + memset(buf_attn_score.ptr(ithr), 0, q_len * h_each_group_len * SV * sizeof(T3)); for (size_t pv = 0; pv < kv_len; pv++) { auto b_kv = beams ? beams.ptr(b)[pv] : b; auto* v = present_value.ptr(b_kv, h_group, pv); @@ -1004,7 +1005,7 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, attn_acc_value(buf_attn_score.ptr(ithr, pq, group_idx), buf_attn_w.ptr(b, h, pq)[pv], v, - S, + SV, p + 0, p + 1); } @@ -1014,15 +1015,15 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = h_group * h_each_group_len, group_idx = 0; h < (h_group + 1) * h_each_group_len; h++, group_idx++) { - auto* dst = has_out_transpose ? output_emb.ptr(b, pq, h * S) : output_emb.ptr(b, h, pq); - cvt_copy(dst, buf_attn_score.ptr(ithr, pq, group_idx), S); + auto* dst = has_out_transpose ? output_emb.ptr(b, pq, h * SV) : output_emb.ptr(b, h, pq); + cvt_copy(dst, buf_attn_score.ptr(ithr, pq, group_idx), SV); } } }); return; } - buf_attn_score.resize({static_cast(nthr), B, q_len, H, S}); + buf_attn_score.resize({static_cast(nthr), B, q_len, H, SV}); // buf_attn_w {B, H, q_len, kv_len} parallel_nt_static(nthr, [&](const size_t ithr, const size_t nthr) { size_t start{0}, end{0}; @@ -1041,7 +1042,7 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, attn_acc_value(buf_attn_score.ptr(ithr, b, 0, h_group), buf_attn_w.ptr(b, h_group, 0, pv)[0], v, - S, + SV, p + 0, p + 1); parallel_it_step(pv, kv_len, b, B, h_group, h_group_num); @@ -1056,7 +1057,7 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, attn_acc_value(buf_attn_score.ptr(ithr, b, pq, h), buf_attn_w.ptr(b, h, pq)[pv], v, - S, + SV, p + 0, p + 1); } @@ -1070,8 +1071,8 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, parallel_for3d(B, H, q_len, [&](size_t b, size_t h, size_t pq) { auto* temp = buf_attn_score.ptr(0, b, pq, h); size_t temp_stride = buf_attn_score.stride(0); - auto* dst = has_out_transpose ? output_emb.ptr(b, pq, h * S) : output_emb.ptr(b, h, pq); - attn_reduce(dst, temp, nthr, S, temp_stride); + auto* dst = has_out_transpose ? 
output_emb.ptr(b, pq, h * SV) : output_emb.ptr(b, h, pq); + attn_reduce(dst, temp, nthr, SV, temp_stride); }); } diff --git a/src/plugins/intel_cpu/src/nodes/paged_attn.cpp b/src/plugins/intel_cpu/src/nodes/paged_attn.cpp index 6bf7d3099a85d9..b9666388490f74 100644 --- a/src/plugins/intel_cpu/src/nodes/paged_attn.cpp +++ b/src/plugins/intel_cpu/src/nodes/paged_attn.cpp @@ -152,18 +152,32 @@ void PagedAttention::execute(dnnl::stream strm) { inputs[i] = getSrcMemoryAtPort(i); } - const auto& queryDims = inputs[0]->getStaticDims(); + auto outDims = inputs[0]->getStaticDims(); + const auto& keyDims = inputs[1]->getStaticDims(); + const auto& valueDims = inputs[2]->getStaticDims(); + // value head_size may be not same with key + if (keyDims[1] != valueDims[1]) { + // The outDims[1] should be `num_heads * v_head_size`, it can be got from: + // because: + // q: query_ps[1] = num_heads * head_size + // k: key_ps[1] = num_kv_heads * head_size + // v: value_ps[1] = num_kv_heads * v_head_size + // therefore: + // q * v / k = (num_heads * head_size) * (num_kv_heads * v_head_size) / + // (num_kv_heads * head_size) = num_heads * v_head_size + outDims[1] = outDims[1] * valueDims[1] / keyDims[1]; + } if (m_hasScore) { size_t len = 0; const auto& pastLensDims = inputs[5]->getStaticDims(); auto pastLens = inputs[5]->getDataAs(); for (size_t i = 0; i < pastLensDims[0]; i++) len += pastLens[i]; - len += queryDims[0]; + len += outDims[0]; VectorDims scoreDims{len}; - redefineOutputMemory({queryDims, scoreDims}); + redefineOutputMemory({outDims, scoreDims}); } else { - redefineOutputMemory(0, queryDims); + redefineOutputMemory(0, outDims); } outputs[0] = getDstMemoryAtPort(0); diff --git a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp index eecba2acff260b..e70a3932b11b1e 100644 --- a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp +++ b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp @@ -135,6 +135,7 @@ struct MHAKernel { auto H = query.size(1); auto q_len = query.size(2); auto head_size = query.size(3); + auto head_size_v = present_value.size(3); auto kv_len = present_key.size(2); auto Hk = present_key.size(1); size_t h_each_group_len = H / Hk; @@ -145,7 +146,7 @@ struct MHAKernel { parallel_for2d(B, H, [&](size_t b, size_t h) { std::vector attn_score(kv_len); - std::vector word_vec(head_size, 0.0f); + std::vector word_vec(head_size_v, 0.0f); for (size_t m = 0; m < q_len; m++) { // dot-product to get attention scores @@ -185,14 +186,14 @@ struct MHAKernel { softmax(&attn_score[0], ncausal); // linearly combine value - word_vec.assign(head_size, 0.0f); + word_vec.assign(head_size_v, 0.0f); for (size_t n = 0; n < ncausal; n++) { auto* v = &present_value.at({b, h / h_each_group_len, n, 0}, true); - accumulate(word_vec.data(), v, head_size, attn_score[n]); + accumulate(word_vec.data(), v, head_size_v, attn_score[n]); } // output [B, L1, H*head_size] - auto* out = has_out_transpose ? &output_emb.at({b, m, h * head_size}) : &output_emb.at({b, h, m}); + auto* out = has_out_transpose ? 
&output_emb.at({b, m, h * head_size_v}) : &output_emb.at({b, h, m}); std::copy(word_vec.begin(), word_vec.end(), out); } }); @@ -259,13 +260,14 @@ struct MHAKernel { return dnnl_dims; } - void prepare_brgemm_prim(dnnl::stream strm, PlainTensor& query, PlainTensor& present_key, bool has_out_transpose) { + void prepare_brgemm_prim(dnnl::stream strm, PlainTensor& query, PlainTensor& present_key, PlainTensor& present_value, bool has_out_transpose) { auto in_type = precision_of::value; auto qkv_dt = DnnlExtensionUtils::ElementTypeToDataType(in_type); auto B = query.size(0); auto H = query.size(1); auto q_len = query.size(2); auto head_size = query.size(3); + auto head_size_v = present_value.size(3); auto kv_len = present_key.size(2); auto Hk = present_key.size(1); brgemmKey qk_key = {q_len, kv_len, head_size, query.stride(2), present_key.stride(2), kv_len, true, in_type}; @@ -289,19 +291,19 @@ struct MHAKernel { qk_gemm_ptr = qk_result.first; if (has_out_transpose) - out_md = dnnl::memory::desc(make_dnnl_dims({B, q_len, H, head_size}), qkv_dt, tag::abcd); + out_md = dnnl::memory::desc(make_dnnl_dims({B, q_len, H, head_size_v}), qkv_dt, tag::abcd); else - out_md = dnnl::memory::desc(make_dnnl_dims({B, H, q_len, head_size}), qkv_dt, tag::abcd); + out_md = dnnl::memory::desc(make_dnnl_dims({B, H, q_len, head_size_v}), qkv_dt, tag::abcd); size_t ldc_index = 2; if (has_out_transpose) { ldc_index = 1; } brgemmKey wv_key = {q_len, - head_size, + head_size_v, kv_len, kv_len * (in_type == ov::element::Type_t::f32 ? 1 : 2), - present_key.stride(2), + present_value.stride(2), static_cast(out_md.get_strides()[ldc_index]), false, in_type}; @@ -329,9 +331,9 @@ struct MHAKernel { const size_t m_block_size = qk_gemm_ptr->get_mblk_size(); weight_score.resize({static_cast(parallel_get_max_threads()), H, m_block_size, kv_len}); if (has_out_transpose) { - fp32_out.resize({B, q_len, H, head_size}); + fp32_out.resize({B, q_len, H, head_size_v}); } else { - fp32_out.resize({B, H, q_len, head_size}); + fp32_out.resize({B, H, q_len, head_size_v}); } return; } @@ -348,7 +350,7 @@ struct MHAKernel { const auto B = query.size(0); const auto H = query.size(1); const auto q_len = query.size(2); - const auto head_size = query.size(3); + const auto head_size_v = present_value.size(3); const auto Hk = present_key.size(1); const auto kv_len = present_key.size(2); size_t h_each_group_len = H / Hk; @@ -423,7 +425,7 @@ struct MHAKernel { if (is_xf16) { fp32_out_ptr = has_out_transpose ? &fp32_out.at({b, m_start, h, 0}) : &fp32_out.at({b, h, m_start, 0}); } else { - fp32_out_ptr = has_out_transpose ? &output_emb.at({b, m_start, h * head_size}) : &output_emb.at({b, h, m_start, 0}); + fp32_out_ptr = has_out_transpose ? &output_emb.at({b, m_start, h * head_size_v}) : &output_emb.at({b, h, m_start, 0}); } T* v_ptr = is_xf16 ? 
&wv_scratch_b.at({b, h / h_each_group_len, 0}) : &present_value.at({b, h / h_each_group_len, 0, 0}); @@ -436,12 +438,12 @@ struct MHAKernel { if (is_xf16) { if (has_out_transpose) { attn_memcpy2d_kernel(&fp32_out.at({b, m_start, h, 0}), - &output_emb.at({b, m_start, h * head_size}), + &output_emb.at({b, m_start, h * head_size_v}), ov::element::f32, precision_of::value, fp32_out.stride(1), output_emb.stride(1), - head_size, + head_size_v, m_cnt); } else { attn_memcpy2d_kernel(&fp32_out.at({b, h, m_start, 0}), @@ -450,7 +452,7 @@ struct MHAKernel { precision_of::value, 0, 0, - m_cnt * head_size, + m_cnt * head_size_v, 1); } } @@ -485,7 +487,7 @@ struct MHAKernel { if (d_scale == 0.0f) d_scale = 1.0f / sqrt(head_size); - prepare_brgemm_prim(strm, query, present_key, has_out_transpose); + prepare_brgemm_prim(strm, query, present_key, present_value, has_out_transpose); execute_brgemm(query, present_key, present_value, @@ -540,6 +542,7 @@ struct MHAKernel { auto H = query.size(1); auto q_len = query.size(2); auto head_size = query.size(3); + auto head_size_v = present_value.size(3); auto kv_len = present_key.size(2); auto h_group_num = present_key.size(1); size_t h_each_group_len = H / h_group_num; @@ -620,9 +623,9 @@ struct MHAKernel { arm_compute::TensorInfo outInfo; arm_compute::Tensor outTensor; - auto out = has_out_transpose ? &output_emb.at({b, m_start, h * head_size}) : &output_emb.at({b, h, m_start}); + auto out = has_out_transpose ? &output_emb.at({b, m_start, h * head_size_v}) : &output_emb.at({b, h, m_start}); auto strides = arm_compute::Strides({output_emb.stride_bytes(1), output_emb.stride_bytes(2)}); - GemmKernel out_gemm(m_cnt, kv_len, head_size, false, precision); + GemmKernel out_gemm(m_cnt, kv_len, head_size_v, false, precision); arm_compute::Strides vStrides({present_value.stride_bytes(3), present_value.stride_bytes(2)}); out_gemm.executeGemm(qkTensor.buffer(), @@ -685,6 +688,7 @@ struct MHAKernel { auto H = query.size(1); auto q_len = query.size(2); auto head_size = query.size(3); + auto head_size_v = present_value.size(3); auto kv_len = present_key.size(2); auto h_group_num = present_key.size(1); size_t h_each_group_len = H / h_group_num; @@ -786,7 +790,7 @@ struct MHAKernel { mlas_sgemm("N", "N", m_cnt, - head_size, + head_size_v, kv_len, 1.0f, qk, @@ -794,7 +798,7 @@ struct MHAKernel { v_ptr, present_value.stride(2), 0.f, - has_out_transpose ? &output_emb.at({b, m_start, h * head_size}) : &output_emb.at({b, h, m_start}), + has_out_transpose ? &output_emb.at({b, m_start, h * head_size_v}) : &output_emb.at({b, h, m_start}), has_out_transpose ? 
output_emb.stride(1) : output_emb.stride(2), 1); }); @@ -875,7 +879,7 @@ struct ScaledDotProductAttention::AttentionExecutor : public ScaledDotProductAtt PlainTensor attn_mask; PlainTensor output_emb(output); float scale_input = 0.0f; - size_t B, L1, L0, S; + size_t B, L1, L0, S, SV; q_input.reset(inputs[0]); k_input.reset(inputs[1]); @@ -911,18 +915,19 @@ struct ScaledDotProductAttention::AttentionExecutor : public ScaledDotProductAtt B = q_input.size(0); L1 = q_input.size(2); S = q_input.size(3); + SV = v_input.size(3); L0 = present_key.size(2) - L1; auto Hk = k_input.size(1); if (fuse_concat) { k_input.assert_dims({B, Hk, L1, S}); - v_input.assert_dims({B, Hk, L1, S}); + v_input.assert_dims({B, Hk, L1, SV}); } else { k_input.assert_dims({B, Hk, L0 + L1, S}); - v_input.assert_dims({B, Hk, L0 + L1, S}); + v_input.assert_dims({B, Hk, L0 + L1, SV}); } present_key.assert_dims({B, Hk, L0 + L1, S}); - present_value.assert_dims({B, Hk, L0 + L1, S}); + present_value.assert_dims({B, Hk, L0 + L1, SV}); if (beam_table) beam_table.assert_dims({B, L0 + L1}); @@ -1222,6 +1227,7 @@ void ScaledDotProductAttention::resetBeamTablePastkv(const MemoryPtr& mem_cur_k, auto H = cur_k.size(1); auto L1 = cur_k.size(2); auto S = cur_k.size(3); + auto SV = cur_v.size(3); auto reverse = [&order] (const std::vector& cur) { std::vector result(cur.size()); for (size_t i = 0; i < cur.size(); i++) { @@ -1244,12 +1250,17 @@ void ScaledDotProductAttention::resetBeamTablePastkv(const MemoryPtr& mem_cur_k, // BHLS is the stated input shape of SDPA, however internally we use LBHS for KV-cache storage. // real_order is used to permute the original shape to LBHS std::vector shape = reverse({B, H, (L0 + L1) * 2, S}); - auto mem_desc = std::make_shared(kvcache_precision, - Shape(shape), - permute_axes(shape, real_order), - real_order); - auto new_internal_mem_k = std::make_shared(getEngine(), mem_desc); - auto new_internal_mem_v = std::make_shared(getEngine(), mem_desc); + auto mem_desc_k = std::make_shared(kvcache_precision, + Shape(shape), + permute_axes(shape, real_order), + real_order); + auto new_internal_mem_k = std::make_shared(getEngine(), mem_desc_k); + shape = reverse({B, H, (L0 + L1) * 2, SV}); + auto mem_desc_v = std::make_shared(kvcache_precision, + Shape(shape), + permute_axes(shape, real_order), + real_order); + auto new_internal_mem_v = std::make_shared(getEngine(), mem_desc_v); PlainTensor new_pastk, new_pastv, old_past_k, old_past_v; new_pastk.reset(new_internal_mem_k); @@ -1271,7 +1282,7 @@ void ScaledDotProductAttention::resetBeamTablePastkv(const MemoryPtr& mem_cur_k, S * old_past_k.m_element_size); memcpy(&new_pastv.at({b, h, m}), &old_past_v.at({b_kv, h, m}), - S * old_past_v.m_element_size); + SV * old_past_v.m_element_size); }); } if (kvcache_precision == ov::element::u8) { @@ -1301,16 +1312,26 @@ void ScaledDotProductAttention::resetBeamTablePastkv(const MemoryPtr& mem_cur_k, std::vector new_shape = reverse({B, H, (L0 + L1), S}); // Get the shape of physical layout using real order - auto strides = mem_desc->getStrides(); - mem_desc = std::make_shared(kvcache_precision, - Shape(new_shape), - permute_axes(new_shape, real_order), - real_order, - 0, - VectorDims{}, - strides); - new_internal_mem_k->redefineDesc(mem_desc); - new_internal_mem_v->redefineDesc(mem_desc); + auto strides = mem_desc_k->getStrides(); + mem_desc_k = std::make_shared(kvcache_precision, + Shape(new_shape), + permute_axes(new_shape, real_order), + real_order, + 0, + VectorDims{}, + mem_desc_k->getStrides()); + 
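// Orientation note (illustrative shapes, not from the tests): the "stated"
// SDPA input shape is BHLS, e.g. {B, H, L, S} = {2, 8, 1024, 128}, while the
// internal KV-cache buffer is kept in LBHS order, e.g. {1024, 2, 8, 128},
// so strides[L] > strides[B] and the cache can presumably grow along L by
// appending at the end of the buffer rather than reallocating per step.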
new_internal_mem_k->redefineDesc(mem_desc_k); + new_shape = reverse({B, H, (L0 + L1), SV}); + // Get the shape of physical layout using real order + strides = mem_desc_v->getStrides(); + mem_desc_v = std::make_shared(kvcache_precision, + Shape(new_shape), + permute_axes(new_shape, real_order), + real_order, + 0, + VectorDims{}, + strides); + new_internal_mem_v->redefineDesc(mem_desc_v); if (kvcache_precision == ov::element::u8) { // past_k's shape is BHLS, internal layout LBHS // scale_zp's shape is LBHS, internal layout LBHS @@ -1324,7 +1345,7 @@ void ScaledDotProductAttention::resetBeamTablePastkv(const MemoryPtr& mem_cur_k, m_k_state->assign_internal_state(new_internal_mem_k); m_v_state->assign_internal_state(new_internal_mem_v); m_k_state->assign_internal_state_max_size(B * H * (L0 + L1) * 2 * S); - m_v_state->assign_internal_state_max_size(B * H * (L0 + L1) * 2 * S); + m_v_state->assign_internal_state_max_size(B * H * (L0 + L1) * 2 * SV); } // 3. create beam table { @@ -1534,6 +1555,7 @@ void ScaledDotProductAttention::updatePastkv(const MemoryPtr& mem_cur_k, const M auto H = cur_k.size(1); auto L1 = cur_k.size(2); auto S = cur_k.size(3); + auto SV = cur_v.size(3); auto reverse = [&order] (const std::vector& cur) { std::vector result(cur.size()); for (size_t i = 0; i < cur.size(); i++) { @@ -1558,13 +1580,15 @@ void ScaledDotProductAttention::updatePastkv(const MemoryPtr& mem_cur_k, const M // new_shape is the shape used by the original model which maybe different from BHLS, reverse here is to permute BHLS to original model shape. // BHLS is the stated input shape of SDPA, however internally we use LBHS for KV-cache storage. // real_order is used to permute the original shape to LBHS - std::vector new_shape = reverse({B, H, (L0 + L1) * 2, S}); - auto real_shape = permute_axes(new_shape, real_order); - auto mem_desc = - std::make_shared(kvcache_precision, Shape(new_shape), real_shape, real_order); + auto new_memory = [&] (size_t new_S) { + std::vector new_shape = reverse({B, H, (L0 + L1) * 2, new_S}); + auto real_shape = permute_axes(new_shape, real_order); + auto mem_desc = std::make_shared(kvcache_precision, Shape(new_shape), real_shape, real_order); + return std::make_shared(getEngine(), mem_desc); + }; - auto new_internal_mem_k = std::make_shared(getEngine(), mem_desc); - auto new_internal_mem_v = std::make_shared(getEngine(), mem_desc); + auto new_internal_mem_k = new_memory(S); + auto new_internal_mem_v = new_memory(SV); PlainTensor new_pastk, new_pastv; new_pastk.reset(new_internal_mem_k); @@ -1585,7 +1609,7 @@ void ScaledDotProductAttention::updatePastkv(const MemoryPtr& mem_cur_k, const M m_k_state->assign_internal_state(new_internal_mem_k); m_v_state->assign_internal_state(new_internal_mem_v); m_k_state->assign_internal_state_max_size(2 * (L0 + L1) * B * H * S); - m_v_state->assign_internal_state_max_size(2 * (L0 + L1) * B * H * S); + m_v_state->assign_internal_state_max_size(2 * (L0 + L1) * B * H * SV); if (kvcache_precision == ov::element::u8) { auto& old_scale_zp_k = m_k_state->get_scale_zp(); auto& old_scale_zp_v = m_v_state->get_scale_zp(); @@ -1610,21 +1634,23 @@ void ScaledDotProductAttention::updatePastkv(const MemoryPtr& mem_cur_k, const M // new_shape is the shape used by the original model which maybe different from BHLS, reverse here is to permute BHLS to original model shape. // BHLS is the stated input shape of SDPA, however internally we use LBHS for KV-cache storage. 
// real_order is used to permute the original shape to LBHS - std::vector new_shape = reverse({B, H, (L0 + L1), S}); - VectorDims strides(new_shape.size(), 1); - auto real_shape = permute_axes(new_shape, real_order); - for (size_t i = 2; i <= real_shape.size(); i++) { - strides[real_shape.size() - i] = strides[real_shape.size() - (i-1)] * real_shape[real_shape.size() - (i-1)]; - } - auto mem_desc = std::make_shared(kvcache_precision, - Shape(new_shape), - real_shape, - real_order, - 0, - VectorDims{}, - strides); - internal_mem_k->redefineDesc(mem_desc); - internal_mem_v->redefineDesc(mem_desc); + auto reset_desc = [&] (size_t new_S) { + std::vector new_shape = reverse({B, H, (L0 + L1), new_S}); + VectorDims strides(new_shape.size(), 1); + auto real_shape = permute_axes(new_shape, real_order); + for (size_t i = 2; i <= real_shape.size(); i++) { + strides[real_shape.size() - i] = strides[real_shape.size() - (i-1)] * real_shape[real_shape.size() - (i-1)]; + } + return std::make_shared(kvcache_precision, + Shape(new_shape), + real_shape, + real_order, + 0, + VectorDims{}, + strides); + }; + internal_mem_k->redefineDesc(reset_desc(S)); + internal_mem_v->redefineDesc(reset_desc(SV)); if (kvcache_precision == ov::element::u8) { auto& old_scale_zp_k = m_k_state->get_scale_zp(); auto& old_scale_zp_v = m_v_state->get_scale_zp(); @@ -1640,18 +1666,19 @@ void ScaledDotProductAttention::updatePastkv(const MemoryPtr& mem_cur_k, const M // new_shape is the shape used by the original model which maybe different from BHLS, reverse here is to permute BHLS to original model shape. // BHLS is the stated input shape of SDPA, however internally we use LBHS for KV-cache storage. // real_order is used to permute the original shape to LBHS - std::vector new_shape = reverse({B, H, (L0 + L1), S}); - auto real_shape = permute_axes(new_shape, real_order); - auto mem_desc = - std::make_shared(kvcache_precision, - Shape(new_shape), - real_shape, - real_order, - 0, - VectorDims{}, - internal_mem_k->getDescWithType()->getStrides()); - internal_mem_k->redefineDesc(mem_desc); - internal_mem_v->redefineDesc(mem_desc); + auto redefine_desc = [&] (MemoryPtr& mem, size_t new_S) { + std::vector new_shape = reverse({B, H, (L0 + L1), new_S}); + auto real_shape = permute_axes(new_shape, real_order); + return std::make_shared(kvcache_precision, + Shape(new_shape), + real_shape, + real_order, + 0, + VectorDims{}, + mem->getDescWithType()->getStrides()); + }; + internal_mem_k->redefineDesc(redefine_desc(internal_mem_k, S)); + internal_mem_v->redefineDesc(redefine_desc(internal_mem_v, SV)); } if (!past_k) { diff --git a/src/plugins/intel_cpu/src/shape_inference/custom/scaled_attn.cpp b/src/plugins/intel_cpu/src/shape_inference/custom/scaled_attn.cpp index e00e5cae8aae74..c2e8ebd92430bf 100644 --- a/src/plugins/intel_cpu/src/shape_inference/custom/scaled_attn.cpp +++ b/src/plugins/intel_cpu/src/shape_inference/custom/scaled_attn.cpp @@ -20,29 +20,47 @@ class SDPAShapeInfer : public ShapeInferEmptyPads { IShapeInfer::Result infer(const std::vector>& input_shapes, const std::unordered_map& data_dependency) override { const auto& query_dims = input_shapes.front().get(); - VectorDims present_kv_dims = input_shapes.back().get(); + VectorDims present_v_dims = input_shapes.back().get(); const auto& beam_idx_dims = input_shapes.end()[-3].get(); const auto& permute_axes = m_config.permute_axes; if (permute_axes.empty()) { // [B, H, L, S] - present_kv_dims[0] = beam_idx_dims[0]; - present_kv_dims[2] += query_dims[2]; - return {{query_dims, 
present_kv_dims, present_kv_dims}, ShapeInferStatus::success}; + present_v_dims[0] = beam_idx_dims[0]; + present_v_dims[2] += query_dims[2]; + // normal and fast path + if (present_v_dims[3] == query_dims[3]) + return {{query_dims, present_v_dims, present_v_dims}, ShapeInferStatus::success}; + + // diff kv feature size + auto output_dims = query_dims; + output_dims[3] = present_v_dims[3]; + auto present_k_dims = present_v_dims; + present_k_dims[3] = query_dims[3]; + return {{output_dims, present_k_dims, present_v_dims}, ShapeInferStatus::success}; } // permute_axes[0,1,2,3] gives axis indices of B,H,L,S for query & present_kv const size_t batch_index = permute_axes[0]; const size_t length_index = permute_axes[2]; - present_kv_dims[batch_index] = beam_idx_dims[0]; - present_kv_dims[length_index] += query_dims[length_index]; + present_v_dims[batch_index] = beam_idx_dims[0]; + present_v_dims[length_index] += query_dims[length_index]; auto n_dims = query_dims.size(); VectorDims output_dims(n_dims); for (size_t i = 0; i < n_dims; i++) { output_dims[i] = query_dims[permute_axes[i]]; } - return {{output_dims, present_kv_dims, present_kv_dims}, ShapeInferStatus::success}; + + // normal and fast path + if (present_v_dims[3] == query_dims[3]) + return {{output_dims, present_v_dims, present_v_dims}, ShapeInferStatus::success}; + + // diff kv feature size + output_dims[3] = present_v_dims[3]; + auto present_k_dims = present_v_dims; + present_k_dims[3] = query_dims[3]; + return {{output_dims, present_k_dims, present_v_dims}, ShapeInferStatus::success}; } port_mask_t get_port_mask() const override { diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/sdpa.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/sdpa.cpp index 63b4520cf1b0db..4421499d10204d 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/sdpa.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/sdpa.cpp @@ -27,35 +27,52 @@ void ov::intel_cpu::ScaledDotProductAttentionWithKVCache::validate_and_infer_typ // [B, H, L1, S] auto q_ps = get_input_partial_shape(0); // [B, H, L0, S] - auto past_kv_ps = get_input_partial_shape(input_num - 1); + auto past_k_ps = get_input_partial_shape(input_num - 2); + auto past_v_ps = get_input_partial_shape(input_num - 1); // [present_kv_batch_size] auto beam_idx_ps = get_input_partial_shape(input_num - 3); auto output_logits = q_ps; NODE_VALIDATION_CHECK(this, m_config.output_BLHxS == false); + NODE_VALIDATION_CHECK(this, q_ps.rank().is_static()); NODE_VALIDATION_CHECK(this, q_ps.size() >= 3); // permute_axes from original to [B, H, L, S] const auto& permute_axes = this->m_config.permute_axes; - if (past_kv_ps.rank().is_static()) { + if (past_k_ps.rank().is_static() || past_v_ps.rank().is_static()) { const size_t batch_index = permute_axes.empty() ? 0 : permute_axes[0]; const size_t length_index = permute_axes.empty() ? q_ps.size() - 2 : permute_axes[permute_axes.size() - 2]; const size_t head_num_index = permute_axes.empty() ? 
q_ps.size() - 3 : permute_axes[permute_axes.size() - 3]; - NODE_VALIDATION_CHECK(this, q_ps.size() == past_kv_ps.size()); + if (past_k_ps.rank().is_static()) + NODE_VALIDATION_CHECK(this, q_ps.size() == past_k_ps.size()); + if (past_v_ps.rank().is_static()) + NODE_VALIDATION_CHECK(this, q_ps.size() == past_v_ps.size()); for (size_t i = 0; i < q_ps.size(); i++) { if (i == head_num_index) { - if (q_ps[i].is_static() && past_kv_ps[i].is_static()) { + if (q_ps[i].is_static() && past_v_ps[i].is_static()) { NODE_VALIDATION_CHECK(this, - q_ps[i].get_length() % past_kv_ps[i].get_length() == 0, + q_ps[i].get_length() % past_v_ps[i].get_length() == 0, "shape not compatiable at index ", i); } + if (past_k_ps[i].is_static() && past_v_ps[i].is_static()) { + NODE_VALIDATION_CHECK(this, + past_k_ps[i].get_length() == past_v_ps[i].get_length(), + "kv shape not compatiable at index ", + i); + } } else { continue; } } // batch_size can be dynamically changed by gather logic - past_kv_ps[batch_index] = beam_idx_ps[0]; - past_kv_ps[length_index] += q_ps[length_index]; + if (past_k_ps.rank().is_static()) { + past_k_ps[batch_index] = beam_idx_ps[0]; + past_k_ps[length_index] += q_ps[length_index]; + } + if (past_v_ps.rank().is_static()) { + past_v_ps[batch_index] = beam_idx_ps[0]; + past_v_ps[length_index] += q_ps[length_index]; + } } if (!permute_axes.empty()) { if (q_ps.rank().is_static()) { @@ -65,9 +82,11 @@ void ov::intel_cpu::ScaledDotProductAttentionWithKVCache::validate_and_infer_typ } } } + if (output_logits.rank().is_static() && past_v_ps.rank().is_static()) + output_logits[output_logits.size() - 1] = past_v_ps[output_logits.size() - 1]; set_output_type(0, get_input_element_type(0), output_logits); - set_output_type(1, get_input_element_type(input_num - 1), past_kv_ps); - set_output_type(2, get_input_element_type(input_num - 1), past_kv_ps); + set_output_type(1, get_input_element_type(input_num - 1), past_k_ps); + set_output_type(2, get_input_element_type(input_num - 1), past_v_ps); } bool ov::intel_cpu::ScaledDotProductAttentionWithKVCache::visit_attributes(ov::AttributeVisitor& visitor) { diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/concat_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/concat_sdp.cpp index 8a9212f8998f94..f049a16a7640fc 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/concat_sdp.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/concat_sdp.cpp @@ -38,6 +38,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConcatSDPTest, ::testing::Combine(::testing::Values(ElementType::f16), ::testing::ValuesIn(inputShapes), ::testing::Values(false), + ::testing::Values(true, false), ::testing::Values(true, false)), ConcatSDPTest::getTestCaseName); } // namespace diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.cpp index f5a7bfacfac99f..83fc0a635546fc 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.cpp @@ -30,7 +30,8 @@ std::string ConcatSDPTest::getTestCaseName(const testing::TestParamInfo inputShapes; bool forceKVU8; bool hasShapeOf; - std::tie(inType, inputShapes, forceKVU8, hasShapeOf) = obj.param; + bool isDiffKVHeadSize; + std::tie(inType, inputShapes, forceKVU8, hasShapeOf, isDiffKVHeadSize) = 
obj.param; std::ostringstream result; result << "IS="; for (const auto& shape : inputShapes) { @@ -48,14 +49,15 @@ std::string ConcatSDPTest::getTestCaseName(const testing::TestParamInfoset_friendly_name("k"); inputParams[2]->set_friendly_name("v"); @@ -81,9 +87,15 @@ void ConcatSDPTest::SetUp() { ov::op::util::VariableInfo{inputDynamicShapes[1], inType, "pastk"}); auto pastk = std::make_shared(inputParams[3], var_k); pastk->set_friendly_name("pastk_r"); + // pastv init_cost + auto v_init_ps = inputDynamicShapes[1]; + if (m_isDiffKVHeadSize) { + v_init_ps[3] += m_diffKVHeadSize; + } + inputParams.push_back(std::make_shared(inType, v_init_ps)); auto var_v = std::make_shared( - ov::op::util::VariableInfo{inputDynamicShapes[1], inType, "pastv"}); - auto pastv = std::make_shared(inputParams[3], var_v); + ov::op::util::VariableInfo{v_init_ps, inType, "pastv"}); + auto pastv = std::make_shared(inputParams[4], var_v); pastv->set_friendly_name("pastv_r"); auto beam_idx = std::make_shared(ElementType::i32, ov::PartialShape{-1}); beam_idx->set_friendly_name("beam_idx"); @@ -125,15 +137,6 @@ void ConcatSDPTest::SetUp() { manager.run_passes(functionRefs); } -void ConcatSDPTest::generate_inputs(const std::vector& targetInputStaticShapes) { - std::vector shapes(4); - shapes[0] = targetInputStaticShapes[0]; - shapes[1] = targetInputStaticShapes[0]; - shapes[2] = targetInputStaticShapes[0]; - shapes[3] = targetInputStaticShapes[1]; - SubgraphBaseTest::generate_inputs(shapes); -} - template void strided_iota(IT first, size_t n, T value, T stride) { for (size_t i = 0; i < n; i++) { @@ -163,17 +166,26 @@ void ConcatSDPTest::generate(int idx, const std::vector& targetInputS strided_iota(static_cast(t.data()), t.get_size(), val, 0.0f); inputs.insert({param, t}); } else { + ASSERT_TRUE(param->get_element_type() == element::bf16); ov::Tensor t{ov::element::bf16, shape}; strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); inputs.insert({param, t}); } }; // q, k, v, pastkv + auto v_shape = targetInputStaticShapes[0]; + auto v_init_shape = targetInputStaticShapes[1]; + if (m_isDiffKVHeadSize) { + v_shape[3] += m_diffKVHeadSize; + v_init_shape[3] += m_diffKVHeadSize; + } + create_input(function->get_parameters()[0], targetInputStaticShapes[0], idx + 1.0f); create_input(function->get_parameters()[1], targetInputStaticShapes[0], idx + 2.0f); - create_input(function->get_parameters()[2], targetInputStaticShapes[0], idx + 3.0f); + create_input(function->get_parameters()[2], v_shape, idx + 3.0f); create_input(function->get_parameters()[3], targetInputStaticShapes[1], idx + 4.0f); - create_input(function->get_parameters()[4], ov::Shape{targetInputStaticShapes[0][0]}, idx + 0.0f); + create_input(function->get_parameters()[4], v_init_shape, idx + 4.0f); + create_input(function->get_parameters()[5], ov::Shape{targetInputStaticShapes[0][0]}, idx + 0.0f); } void ConcatSDPTest::prepare() { @@ -214,7 +226,8 @@ TEST_P(ConcatSDPTest, CompareWithRefs) { std::vector inputShapes; bool forceKVU8; bool hasShapeOf; - std::tie(inType, inputShapes, forceKVU8, hasShapeOf) = this->GetParam(); + bool isDiffKVHeadSize; + std::tie(inType, inputShapes, forceKVU8, hasShapeOf, isDiffKVHeadSize) = this->GetParam(); auto actualOutputs = run_test(function); if (!hasShapeOf) { diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.hpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.hpp index ac59e48f496b3b..83e1814f18b2ee 100644 --- 
a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.hpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.hpp @@ -34,7 +34,7 @@ namespace test { template void strided_iota(IT first, size_t n, T value, T stride); -typedef std::tuple, bool, bool> ConcatSDPTestParams; +typedef std::tuple, bool, bool, bool> ConcatSDPTestParams; class ConcatSDPTest : public testing::WithParamInterface, @@ -48,9 +48,11 @@ class ConcatSDPTest : std::vector run_test(std::shared_ptr model); bool m_forceKVU8; bool m_hasShapeOf; + bool m_isDiffKVHeadSize; protected: - void generate_inputs(const std::vector& targetInputStaticShapes) override; void SetUp() override; + + static constexpr size_t m_diffKVHeadSize = 16; }; } // namespace test diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_sdp.cpp index 57927434524891..6761acf8b5dfb1 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_sdp.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_sdp.cpp @@ -38,6 +38,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConcatSDPTest, ::testing::Combine(::testing::Values(ElementType::f32), ::testing::ValuesIn(inputShapes), ::testing::Values(true, false), + ::testing::Values(true, false), ::testing::Values(true, false)), ConcatSDPTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/concat_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/concat_sdp.cpp index 93c99048fec349..29667e2ffa3072 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/concat_sdp.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/concat_sdp.cpp @@ -38,6 +38,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConcatSDPTest, ::testing::Combine(::testing::Values(ElementType::bf16, ElementType::f16), ::testing::ValuesIn(inputShapes), ::testing::Values(true, false), + ::testing::Values(true, false), ::testing::Values(true, false)), ConcatSDPTest::getTestCaseName); From 28bb0fd510036545e9b07f3ed2650cc531f78b03 Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Thu, 17 Oct 2024 19:49:32 +0400 Subject: [PATCH 09/32] [PT FE] Handle None-value case among inputs (#27102) **Details:** Handle None-value case **Ticket:** TBD --------- Signed-off-by: Kazantsev, Roman --- src/frontends/pytorch/src/node_context.cpp | 28 ++++++++++++++++------ 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/frontends/pytorch/src/node_context.cpp b/src/frontends/pytorch/src/node_context.cpp index 565b0cdbd39385..6a8c370ef2b410 100644 --- a/src/frontends/pytorch/src/node_context.cpp +++ b/src/frontends/pytorch/src/node_context.cpp @@ -4,6 +4,7 @@ #include "openvino/frontend/pytorch/node_context.hpp" +#include "helper_ops/internal_op.hpp" #include "openvino/core/validation_util.hpp" #include "openvino/frontend/exception.hpp" #include "openvino/frontend/pytorch/decoder.hpp" @@ -151,13 +152,26 @@ OutputVector NodeContext::inputs() const { if (input == 0) { // Case when input can be inlined (possible only for fx decoder) if (m_decoder->is_input_inlined(i)) { - auto inlined_input = m_decoder->inlined_input(i); - FRONT_END_GENERAL_CHECK(inlined_input.size() == 1, - "Incorrect inlined input with index: ", - i, - " for operation ", - get_op_type()); - res.push_back(inlined_input[0]); + if 
(input_is_none(i)) { + // some operations like aten.index.Tensor can have None inputs + auto dummy_decoder = std::make_shared("torch::None", 1); + auto fw_node = std::make_shared(dummy_decoder, OutputVector{}); + auto attrs = fw_node->get_attrs(); + attrs["none_value"] = ""; + attrs[PtFrameworkNode::failed_conversion_key] = + "None constant cannot be converted to OpenVINO opset and should be removed by consuming " + "operation."; + fw_node->set_attrs(attrs); + res.push_back(fw_node->output(0)); + } else { + auto inlined_input = m_decoder->inlined_input(i); + FRONT_END_GENERAL_CHECK(inlined_input.size() == 1, + "Incorrect inlined input with index: ", + i, + " for operation ", + get_op_type()); + res.push_back(inlined_input[0]); + } continue; } } From 55d8c47f850076454f0f1a478fca4549a5206021 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 17 Oct 2024 22:24:25 +0200 Subject: [PATCH 10/32] Update setuptools requirement from <74.1.0,>=65.6.1 to >=65.6.1,<75.3.0 in /src/bindings/python (#27098) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updates the requirements on [setuptools](https://github.com/pypa/setuptools) to permit the latest version.
Changelog

Sourced from setuptools's changelog.

v75.2.0

Features

  • Made errors when parsing Distribution data more explicit about the expected type (tuple[str, ...] | list[str]) -- by :user:Avasam (#4578)

Bugfixes

  • Fix a TypeError when a Distribution's old included attribute was a tuple -- by :user:Avasam (#4578)
  • Add workaround for bdist_wheel --dist-info-dir errors when customisation does not inherit from setuptools. (#4684)

v75.1.1

Bugfixes

  • Re-use pre-existing .dist-info dir when creating wheels via the build backend APIs (PEP 517) and the metadata_directory argument is passed -- by :user:pelson. (#1825)
  • Changed egg_info command to avoid adding an empty .egg-info directory while iterating over entry-points. This avoids triggering integration problems with importlib.metadata/importlib_metadata (reference: pypa/pyproject-hooks#206). (#4680)

v75.1.0

Features

  • Deprecated bdist_wheel.universal configuration. (#4617)

Bugfixes

  • Removed reference to upload_docs module in entry points. (#4650)

v75.0.0

Features

... (truncated)

Commits
  • 61a5a03 Bump version: 75.1.1 → 75.2.0
  • 8ad3ea7 Workaround for bdist_wheel.dist_info_dir problems (#4684)
  • 9af0877 Type sequence checks in setuptools/dist.py (#4578)
  • 0534fde Add news fragment
  • 50b732a Check for more specific error message
  • a663287 Add pragma for edge-case code path
  • 96be735 Workaround for bdist_wheel.dist_info_dir problems
  • 000a413 Deprecate public access to setuptools.dist.sequence
  • 00995c1 Use variable msg instead of tmpl in setuptools/dist
  • d457d0e Type sequence checks in setuptools/dist.py
  • Additional commits viewable in compare view

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.
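For reference, the effect of the widened specifier can be sketched as follows (version numbers are those listed in the changelog above; the actual version installed depends on the package index at install time):

    # src/bindings/python/constraints.txt after this update
    setuptools>=65.6.1,<75.3.0   # now admits 75.1.x and 75.2.0, still excludes 75.3.0 and later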
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- src/bindings/python/constraints.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bindings/python/constraints.txt b/src/bindings/python/constraints.txt index bb3d708a0ca23d..b3a8267e4c1f14 100644 --- a/src/bindings/python/constraints.txt +++ b/src/bindings/python/constraints.txt @@ -10,7 +10,7 @@ pytest-timeout==2.3.1 # Python bindings py>=1.9.0 pygments>=2.8.1 -setuptools>=65.6.1,<74.1.0 +setuptools>=65.6.1,<75.3.0 sympy>=1.10 wheel>=0.38.1 patchelf<=0.17.2.1 From 8822480e70c55d16cc2f36d4bfd59ce3e10dd36c Mon Sep 17 00:00:00 2001 From: Tiany1 <54828303+tianyiSKY1@users.noreply.github.com> Date: Fri, 18 Oct 2024 05:22:18 +0800 Subject: [PATCH 11/32] #20927 support inputs that have no batch (#26778) #20927 ### Details: - *add batch dimension before pool* - *remove batch dimension after pool* --- src/frontends/pytorch/src/op/avg_poolnd.cpp | 71 +++++++++++- src/frontends/pytorch/src/op/max_poolnd.cpp | 105 ++++++++++++++++-- src/frontends/pytorch/src/op_table.cpp | 37 +++--- .../layer_tests/pytorch_tests/test_pooling.py | 76 ++++++++----- 4 files changed, 231 insertions(+), 58 deletions(-) diff --git a/src/frontends/pytorch/src/op/avg_poolnd.cpp b/src/frontends/pytorch/src/op/avg_poolnd.cpp index 03c32259b45091..d8223b04bfe690 100644 --- a/src/frontends/pytorch/src/op/avg_poolnd.cpp +++ b/src/frontends/pytorch/src/op/avg_poolnd.cpp @@ -3,12 +3,17 @@ // #include "openvino/frontend/pytorch/node_context.hpp" +#include "openvino/op/add.hpp" #include "openvino/op/avg_pool.hpp" #include "openvino/op/broadcast.hpp" #include "openvino/op/concat.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/pad.hpp" -#include "openvino/op/subtract.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/shape_of.hpp" +#include "openvino/op/slice.hpp" +#include "openvino/op/squeeze.hpp" +#include "openvino/op/unsqueeze.hpp" #include "utils.hpp" namespace ov { @@ -17,10 +22,31 @@ namespace pytorch { namespace op { using namespace ov::op; - -OutputVector translate_avg_poolnd(const NodeContext& context) { +OutputVector translate_avg_pool_base(const NodeContext& context, int dims) { num_inputs_check(context, 2, 7); auto input = context.get_input(0); + auto input_shape = context.mark_node(std::make_shared(input)); + + auto const_0 = v0::Constant::create(element::i64, Shape{1}, {0}); + auto const_1 = v0::Constant::create(element::i64, Shape{1}, {1}); + bool is_static = input.get_partial_shape().rank().is_static(); + bool no_batch_dim = is_static && input.get_partial_shape().rank().get_length() == dims + 1; + + if (is_static) { + if (no_batch_dim) { + input = context.mark_node(std::make_shared(input, const_0)); + } + } else { + input = context.mark_node(std::make_shared(input, const_0)); + auto unsqueeze_shape = context.mark_node(std::make_shared(input)); + auto rank = context.mark_node(std::make_shared(unsqueeze_shape)); + auto end_index = context.mark_node(std::make_shared(rank, const_1)); + auto start_index = context.mark_node(v0::Constant::create(element::i64, Shape{1}, {-dims - 2})); + auto reshape_pattern = + context.mark_node(std::make_shared(unsqueeze_shape, start_index, end_index, const_1, const_0)); + input = context.mark_node(std::make_shared(input, reshape_pattern, true)); + } + auto kernel = context.const_input(1); Strides strides; if (!context.input_is_none(2)) { @@ -47,8 +73,43 @@ OutputVector translate_avg_poolnd(const NodeContext& context) { } 
PYTORCH_OP_CONVERSION_CHECK(context.input_is_none(6), "Translation for aten::avg_pool2d do not support divisor_override input."); - return {context.mark_node( - std::make_shared(input, strides, pads, pads, kernel, !count_include_pad, rounding_type))}; + auto res = context.mark_node( + std::make_shared(input, strides, pads, pads, kernel, !count_include_pad, rounding_type)); + + if (is_static) { + if (no_batch_dim) { + res = context.mark_node(std::make_shared(res, const_0)); + } + } else { + auto pooled_output_shape = context.mark_node(std::make_shared(res)); + + auto start_index_input = context.mark_node(v0::Constant::create(element::i64, Shape{1}, {-dims})); + auto slice_input_shape = + context.mark_node(std::make_shared(input_shape, const_0, start_index_input, const_1, const_0)); + + auto start_index_pooled = context.mark_node(v0::Constant::create(element::i64, Shape{1}, {-dims})); + auto end_index_pooled = context.mark_node(v0::Constant::create(element::i64, Shape{1}, {2 + dims})); + auto slice_pooled_output_shape = context.mark_node( + std::make_shared(pooled_output_shape, start_index_pooled, end_index_pooled, const_1, const_0)); + + auto concat_shape = context.mark_node( + std::make_shared(OutputVector{slice_input_shape, slice_pooled_output_shape}, 0)); + res = context.mark_node(std::make_shared(res, concat_shape, true)); + } + + return {res}; +}; + +OutputVector translate_avg_pool1d(const NodeContext& context) { + return translate_avg_pool_base(context, 1); +}; + +OutputVector translate_avg_pool2d(const NodeContext& context) { + return translate_avg_pool_base(context, 2); +}; + +OutputVector translate_avg_pool3d(const NodeContext& context) { + return translate_avg_pool_base(context, 3); }; } // namespace op diff --git a/src/frontends/pytorch/src/op/max_poolnd.cpp b/src/frontends/pytorch/src/op/max_poolnd.cpp index b6a01af1a7c2df..b846de68d28b49 100644 --- a/src/frontends/pytorch/src/op/max_poolnd.cpp +++ b/src/frontends/pytorch/src/op/max_poolnd.cpp @@ -12,9 +12,13 @@ #include "openvino/op/multiply.hpp" #include "openvino/op/pad.hpp" #include "openvino/op/range.hpp" +#include "openvino/op/reshape.hpp" #include "openvino/op/select.hpp" #include "openvino/op/shape_of.hpp" +#include "openvino/op/slice.hpp" +#include "openvino/op/squeeze.hpp" #include "openvino/op/subtract.hpp" +#include "openvino/op/unsqueeze.hpp" #include "openvino/op/util/framework_node.hpp" #include "utils.hpp" @@ -24,9 +28,31 @@ namespace pytorch { namespace op { using namespace ov::op; - -OutputVector translate_max_poolnd(const NodeContext& context) { +OutputVector translate_max_pool_base(const NodeContext& context, int dims) { num_inputs_check(context, 3, 6); + auto input = context.get_input(0); + auto input_shape = context.mark_node(std::make_shared(input)); + + auto const_0 = v0::Constant::create(element::i64, Shape{1}, {0}); + auto const_1 = v0::Constant::create(element::i64, Shape{1}, {1}); + bool is_static = input.get_partial_shape().rank().is_static(); + bool no_batch_dim = is_static && input.get_partial_shape().rank().get_length() == dims + 1; + + if (is_static) { + if (no_batch_dim) { + input = context.mark_node(std::make_shared(input, const_0)); + } + } else { + input = context.mark_node(std::make_shared(input, const_0)); + auto unsqueeze_shape = context.mark_node(std::make_shared(input)); + auto rank = context.mark_node(std::make_shared(unsqueeze_shape)); + auto end_index = context.mark_node(std::make_shared(rank, const_1)); + auto start_index = context.mark_node(v0::Constant::create(element::i64, 
Shape{1}, {-dims - 2})); + auto reshape_pattern = + context.mark_node(std::make_shared(unsqueeze_shape, start_index, end_index, const_1, const_0)); + input = context.mark_node(std::make_shared(input, reshape_pattern, true)); + } + auto kernel = context.const_input(1); Strides strides; if (!context.input_is_none(2)) { @@ -53,7 +79,7 @@ OutputVector translate_max_poolnd(const NodeContext& context) { rounding_type = context.const_input(5) ? RoundingType::CEIL_TORCH : RoundingType::FLOOR; } - auto res = context.mark_node(std::make_shared(context.get_input(0), + auto res = context.mark_node(std::make_shared(input, strides, dilations, pads, @@ -63,19 +89,76 @@ OutputVector translate_max_poolnd(const NodeContext& context) { PadType::EXPLICIT, element::i64, 2)); - if (context.get_output_size() == 2) { - auto out1 = res->output(0); - auto out2 = res->output(1); - return {std::move(out1), std::move(out2)}; + if (is_static) { + if (no_batch_dim) { + if (context.get_output_size() == 2) { + auto out1 = res->output(0); + auto out2 = res->output(1); + out1 = context.mark_node(std::make_shared(out1, const_0)); + out2 = context.mark_node(std::make_shared(out2, const_0)); + return {std::move(out1), std::move(out2)}; + } else { + res = context.mark_node(std::make_shared(res, const_0)); + return {res}; + } + } else { + if (context.get_output_size() == 2) { + auto out1 = res->output(0); + auto out2 = res->output(1); + return {std::move(out1), std::move(out2)}; + } else { + return {res}; + } + } + } else { - return {res}; + auto pooled_output_shape = context.mark_node(std::make_shared(res)); + + auto start_index_input = context.mark_node(v0::Constant::create(element::i64, Shape{1}, {-dims})); + auto slice_input_shape = + context.mark_node(std::make_shared(input_shape, const_0, start_index_input, const_1, const_0)); + + auto start_index_pooled = context.mark_node(v0::Constant::create(element::i64, Shape{1}, {-dims})); + auto end_index_pooled = context.mark_node(v0::Constant::create(element::i64, Shape{1}, {2 + dims})); + auto slice_pooled_output_shape = context.mark_node( + std::make_shared(pooled_output_shape, start_index_pooled, end_index_pooled, const_1, const_0)); + + auto concat_shape = context.mark_node( + std::make_shared(OutputVector{slice_input_shape, slice_pooled_output_shape}, 0)); + if (context.get_output_size() == 2) { + auto out1 = res->output(0); + auto out2 = res->output(1); + out1 = context.mark_node(std::make_shared(out1, concat_shape, true)); + out2 = context.mark_node(std::make_shared(out2, concat_shape, true)); + return {std::move(out1), std::move(out2)}; + } else { + res = context.mark_node(std::make_shared(res, concat_shape, true)); + return {res}; + } } }; -OutputVector translate_max_poolnd_fx(const NodeContext& context) { - auto output = translate_max_poolnd(context); +OutputVector translate_max_pool1d(const NodeContext& context) { + return translate_max_pool_base(context, 1); +}; + +OutputVector translate_max_pool2d(const NodeContext& context) { + return translate_max_pool_base(context, 2); +}; + +OutputVector translate_max_pool3d(const NodeContext& context) { + return translate_max_pool_base(context, 3); +}; + +OutputVector translate_max_pool2d_fx(const NodeContext& context) { + auto output = translate_max_pool2d(context); return {context.mark_node(make_list_construct(output))}; -} +}; + +OutputVector translate_max_pool3d_fx(const NodeContext& context) { + auto output = translate_max_pool3d(context); + return {context.mark_node(make_list_construct(output))}; +}; } // namespace op } // 
namespace pytorch diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index b68c182e17ee2a..5d63a6dc037b14 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp @@ -42,7 +42,9 @@ OP_CONVERTER(translate_argmax); OP_CONVERTER(translate_argmin); OP_CONVERTER(translate_as_strided); OP_CONVERTER(translate_as_tensor); -OP_CONVERTER(translate_avg_poolnd); +OP_CONVERTER(translate_avg_pool1d); +OP_CONVERTER(translate_avg_pool2d); +OP_CONVERTER(translate_avg_pool3d); OP_CONVERTER(translate_bool); OP_CONVERTER(translate_batch_norm); OP_CONVERTER(translate_bitwise_and); @@ -139,7 +141,9 @@ OP_CONVERTER(translate_masked_scatter); OP_CONVERTER(translate_masked_select); OP_CONVERTER(translate_max); OP_CONVERTER(translate_maximum); -OP_CONVERTER(translate_max_poolnd); +OP_CONVERTER(translate_max_pool1d); +OP_CONVERTER(translate_max_pool2d); +OP_CONVERTER(translate_max_pool3d); OP_CONVERTER(translate_mean); OP_CONVERTER(translate_meshgrid); OP_CONVERTER(translate_min); @@ -281,7 +285,8 @@ OP_CONVERTER(translate_leaky_relu_fx); OP_CONVERTER(translate_log_sigmoid_fx); OP_CONVERTER(translate_log_softmax_fx); OP_CONVERTER(translate_max_dim_fx); -OP_CONVERTER(translate_max_poolnd_fx); +OP_CONVERTER(translate_max_pool2d_fx); +OP_CONVERTER(translate_max_pool3d_fx); OP_CONVERTER(translate_mean_fx); OP_CONVERTER(translate_min_dim_fx); OP_CONVERTER(translate_new_full_fx); @@ -380,9 +385,9 @@ const std::unordered_map get_supported_ops_ts() { {"aten::atanh", op::optional_out, 1>}, {"aten::atanh_", op::inplace_op>}, - {"aten::avg_pool1d", op::quantizable_op}, - {"aten::avg_pool2d", op::quantizable_op}, - {"aten::avg_pool3d", op::quantizable_op}, + {"aten::avg_pool1d", op::quantizable_op}, + {"aten::avg_pool2d", op::quantizable_op}, + {"aten::avg_pool3d", op::quantizable_op}, {"aten::baddbmm", op::translate_addmm}, {"aten::batch_norm", op::translate_batch_norm}, {"aten::bitwise_and", op::translate_bitwise_and}, @@ -534,12 +539,12 @@ const std::unordered_map get_supported_ops_ts() { {"aten::max", op::translate_max}, {"aten::mv", op::translate_1to1_match_2_inputs}, {"aten::maximum", op::translate_maximum}, - {"aten::max_pool1d", op::quantizable_op}, - {"aten::max_pool1d_with_indices", op::quantizable_op}, - {"aten::max_pool2d", op::quantizable_op}, - {"aten::max_pool2d_with_indices", op::quantizable_op}, - {"aten::max_pool3d", op::quantizable_op}, - {"aten::max_pool3d_with_indices", op::quantizable_op}, + {"aten::max_pool1d", op::quantizable_op}, + {"aten::max_pool1d_with_indices", op::quantizable_op}, + {"aten::max_pool2d", op::quantizable_op}, + {"aten::max_pool2d_with_indices", op::quantizable_op}, + {"aten::max_pool3d", op::quantizable_op}, + {"aten::max_pool3d_with_indices", op::quantizable_op}, {"aten::mean", op::quantizable_op}, {"aten::meshgrid", op::translate_meshgrid}, {"aten::min", op::translate_min}, @@ -771,8 +776,8 @@ const std::unordered_map get_supported_ops_fx() { {"aten.asinh.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, {"aten.atan.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, {"aten.atanh.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, - {"aten.avg_pool2d.default", op::translate_avg_poolnd}, - {"aten.avg_pool3d.default", op::translate_avg_poolnd}, + {"aten.avg_pool2d.default", op::translate_avg_pool2d}, + {"aten.avg_pool3d.default", op::translate_avg_pool3d}, {"aten.baddbmm.default", op::translate_addmm_fx}, {"aten.bitwise_and.Scalar", 
op::translate_bitwise_and}, {"aten.bitwise_and.Tensor", op::translate_bitwise_and}, @@ -870,8 +875,8 @@ const std::unordered_map get_supported_ops_fx() { {"aten.masked_fill_.Tensor", op::inplace_op}, {"aten.max.default", op::translate_max}, {"aten.max.dim", op::translate_max_dim_fx}, - {"aten.max_pool2d_with_indices.default", op::translate_max_poolnd_fx}, - {"aten.max_pool3d_with_indices.default", op::translate_max_poolnd_fx}, + {"aten.max_pool2d_with_indices.default", op::translate_max_pool2d_fx}, + {"aten.max_pool3d_with_indices.default", op::translate_max_pool3d_fx}, {"aten.maximum.default", op::translate_maximum}, {"aten.mean.default", op::translate_mean_fx}, {"aten.mean.dim", op::translate_mean_fx}, diff --git a/tests/layer_tests/pytorch_tests/test_pooling.py b/tests/layer_tests/pytorch_tests/test_pooling.py index 32c8a973cb1c92..1924df2484f177 100644 --- a/tests/layer_tests/pytorch_tests/test_pooling.py +++ b/tests/layer_tests/pytorch_tests/test_pooling.py @@ -36,10 +36,8 @@ class TestPooling(PytorchLayerTest): - def _prepare_input(self, ndim=4): - import numpy as np - shape = (1, 3, 15, 15, 15) - return (np.random.randn(*shape[:ndim]).astype(np.float32),) + def _prepare_input(self): + return (self.input_tensor,) def create_model(self, op_type, kernel_size, stride, padding, dilation=1, ceil_mode=True, count_include_pad=True, dtype=torch.float32): class aten_avg_pooling_base(torch.nn.Module): @@ -129,121 +127,147 @@ def forward(self, x): return aten_pooling(), ref_net, f"aten::{op_type}" + @pytest.mark.parametrize("input_shape", [[1, 3, 15], [3, 15]]) @pytest.mark.parametrize("params", d1_params) @pytest.mark.parametrize("ceil_mode", [True, False]) @pytest.mark.parametrize("count_include_pad", [True, False]) + @pytest.mark.parametrize("is_dynamic_shapes", [True, False]) @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', reason='Ticket - 122715') - def test_avg_pool1d(self, params, ceil_mode, count_include_pad, ie_device, precision, ir_version): + def test_avg_pool1d(self, input_shape, params, ceil_mode, count_include_pad, ie_device, precision, ir_version, is_dynamic_shapes): + self.input_tensor = np.random.randn(*input_shape).astype(np.float32) self._test(*self.create_model("avg_pool1d", **params, ceil_mode=ceil_mode, count_include_pad=count_include_pad), - ie_device, precision, ir_version, kwargs_to_prepare_input={'ndim': 3}, trace_model=True, - dynamic_shapes=False) + ie_device, precision, ir_version, trace_model=True, + dynamic_shapes=is_dynamic_shapes) - @pytest.mark.parametrize( - "params", d2_params) + @pytest.mark.parametrize("input_shape", [[1, 3, 15, 15], [3, 15, 15]]) + @pytest.mark.parametrize("params", d2_params) @pytest.mark.parametrize("ceil_mode", [True, False]) @pytest.mark.parametrize("count_include_pad", [True, False]) + @pytest.mark.parametrize("is_dynamic_shapes", [True, False]) @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export @pytest.mark.precommit_fx_backend @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', reason='Ticket - 122715') - def test_avg_pool2d(self, params, ceil_mode, count_include_pad, ie_device, precision, ir_version): + def test_avg_pool2d(self, input_shape, params, ceil_mode, count_include_pad, ie_device, precision, ir_version, is_dynamic_shapes): if ceil_mode and count_include_pad and np.array_equal(np.array(params["kernel_size"]), np.array([8, 8])): 
pytest.xfail("Ticket - 150292") + self.input_tensor = np.random.randn(*input_shape).astype(np.float32) self._test(*self.create_model("avg_pool2d", **params, ceil_mode=ceil_mode, count_include_pad=count_include_pad), - ie_device, precision, ir_version, trace_model=True, freeze_model=False, dynamic_shapes=False) + ie_device, precision, ir_version, trace_model=True, freeze_model=False, dynamic_shapes=is_dynamic_shapes) + @pytest.mark.parametrize("input_shape", [[1, 3, 15, 15, 15], [3, 15, 15, 15]]) @pytest.mark.parametrize("params", d3_params) @pytest.mark.parametrize("ceil_mode", [True, False]) @pytest.mark.parametrize("count_include_pad", [True, False]) + @pytest.mark.parametrize("is_dynamic_shapes", [True, False]) @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export @pytest.mark.precommit_fx_backend @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', reason='Ticket - 122715') - def test_avg_pool3d(self, params, ceil_mode, count_include_pad, ie_device, precision, ir_version): + def test_avg_pool3d(self, input_shape, params, ceil_mode, count_include_pad, ie_device, precision, ir_version, is_dynamic_shapes): + self.input_tensor = np.random.randn(*input_shape).astype(np.float32) self._test(*self.create_model("avg_pool3d", **params, ceil_mode=ceil_mode, count_include_pad=count_include_pad), - ie_device, precision, ir_version, kwargs_to_prepare_input={'ndim': 5}, trace_model=True, - dynamic_shapes=False) + ie_device, precision, ir_version, trace_model=True, + dynamic_shapes=is_dynamic_shapes) + @pytest.mark.parametrize("input_shape", [[1, 3, 15], [3, 15]]) @pytest.mark.parametrize("params", d1_params) @pytest.mark.parametrize("ceil_mode", [True, False]) @pytest.mark.parametrize("dilation", [1, 2]) + @pytest.mark.parametrize("is_dynamic_shapes", [True, False]) @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', reason='Ticket - 122715') - def test_max_pool1d(self, params, ceil_mode, dilation, ie_device, precision, ir_version): + def test_max_pool1d(self, input_shape, params, ceil_mode, dilation, ie_device, precision, ir_version, is_dynamic_shapes): + self.input_tensor = np.random.randn(*input_shape).astype(np.float32) self._test(*self.create_model("max_pool1d", **params, ceil_mode=ceil_mode, dilation=dilation), - ie_device, precision, ir_version, kwargs_to_prepare_input={'ndim': 3}, dynamic_shapes=False) + ie_device, precision, ir_version, dynamic_shapes=is_dynamic_shapes) + @pytest.mark.parametrize("input_shape", [[1, 3, 15, 15], [3, 15, 15]]) @pytest.mark.parametrize("params", d2_params) @pytest.mark.parametrize("ceil_mode", [True, False]) @pytest.mark.parametrize("dilation", [1, 2]) @pytest.mark.parametrize("dtype", [torch.float32, torch.int32]) + @pytest.mark.parametrize("is_dynamic_shapes", [True, False]) @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', reason='Ticket - 122715') - def test_max_pool2d(self, params, ceil_mode, dilation, dtype, ie_device, precision, ir_version): + def test_max_pool2d(self, input_shape, params, ceil_mode, dilation, dtype, ie_device, precision, ir_version, is_dynamic_shapes): to_trace = False if params["stride"] == []: to_trace = True + self.input_tensor = np.random.randn(*input_shape).astype(np.float32) self._test(*self.create_model("max_pool2d", **params, ceil_mode=ceil_mode, 
dilation=dilation, dtype=dtype), - ie_device, precision, ir_version, dynamic_shapes=False, trace_model=to_trace) + ie_device, precision, ir_version, dynamic_shapes=is_dynamic_shapes, trace_model=to_trace) + @pytest.mark.parametrize("input_shape", [[1, 3, 15, 15, 15], [3, 15, 15, 15]]) @pytest.mark.parametrize("params", d3_params) @pytest.mark.parametrize("ceil_mode", [True, False]) @pytest.mark.parametrize("dilation", [1, 2]) + @pytest.mark.parametrize("is_dynamic_shapes", [True, False]) @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', reason='Ticket - 122715') - def test_max_pool3d(self, params, ceil_mode, dilation, ie_device, precision, ir_version): + def test_max_pool3d(self, input_shape, params, ceil_mode, dilation, ie_device, precision, ir_version, is_dynamic_shapes): + self.input_tensor = np.random.randn(*input_shape).astype(np.float32) self._test(*self.create_model("max_pool3d", **params, ceil_mode=ceil_mode, dilation=dilation), - ie_device, precision, ir_version, kwargs_to_prepare_input={'ndim': 5}, dynamic_shapes=False) + ie_device, precision, ir_version, dynamic_shapes=is_dynamic_shapes) + @pytest.mark.parametrize("input_shape", [[1, 3, 15], [3, 15]]) @pytest.mark.parametrize("params", d1_params) @pytest.mark.parametrize("ceil_mode", [True, False]) @pytest.mark.parametrize("dilation", [1, 2]) + @pytest.mark.parametrize("is_dynamic_shapes", [True, False]) @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', reason='Ticket - 122715') - def test_max_pool1d_indices(self, params, ceil_mode, dilation, ie_device, precision, ir_version): + def test_max_pool1d_indices(self, input_shape, params, ceil_mode, dilation, ie_device, precision, ir_version, is_dynamic_shapes): + self.input_tensor = np.random.randn(*input_shape).astype(np.float32) self._test(*self.create_model("max_pool1d_with_indices", **params, ceil_mode=ceil_mode, dilation=dilation), - ie_device, precision, ir_version, kwargs_to_prepare_input={'ndim': 3}, dynamic_shapes=False) + ie_device, precision, ir_version, dynamic_shapes=is_dynamic_shapes) + @pytest.mark.parametrize("input_shape", [[1, 3, 15, 15], [3, 15, 15]]) @pytest.mark.parametrize("params", d2_params) @pytest.mark.parametrize("ceil_mode", [True, False]) @pytest.mark.parametrize("dilation", [1, 2]) + @pytest.mark.parametrize("is_dynamic_shapes", [True, False]) @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_fx_backend @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', reason='Ticket - 122715') - def test_max_pool2d_indices(self, params, ceil_mode, dilation, ie_device, precision, ir_version): + def test_max_pool2d_indices(self, input_shape, params, ceil_mode, dilation, ie_device, precision, ir_version, is_dynamic_shapes): to_trace = False if params["stride"] == []: to_trace = True + self.input_tensor = np.random.randn(*input_shape).astype(np.float32) self._test(*self.create_model("max_pool2d_with_indices", **params, ceil_mode=ceil_mode, dilation=dilation), - ie_device, precision, ir_version, dynamic_shapes=False, trace_model=to_trace) + ie_device, precision, ir_version, dynamic_shapes=is_dynamic_shapes, trace_model=to_trace) + @pytest.mark.parametrize("input_shape", [[1, 3, 15, 15, 15], [3, 15, 15, 15]]) @pytest.mark.parametrize("params", d3_params) @pytest.mark.parametrize("ceil_mode", [True, False]) @pytest.mark.parametrize("dilation", 
[1, 2]) + @pytest.mark.parametrize("is_dynamic_shapes", [True, False]) @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_fx_backend @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', reason='Ticket - 122715') - def test_max_pool3d_indices(self, params, ceil_mode, dilation, ie_device, precision, ir_version): + def test_max_pool3d_indices(self, input_shape, params, ceil_mode, dilation, ie_device, precision, ir_version, is_dynamic_shapes): + self.input_tensor = np.random.randn(*input_shape).astype(np.float32) self._test(*self.create_model("max_pool3d_with_indices", **params, ceil_mode=ceil_mode, dilation=dilation), - ie_device, precision, ir_version, kwargs_to_prepare_input={'ndim': 5}, dynamic_shapes=False) + ie_device, precision, ir_version, dynamic_shapes=is_dynamic_shapes) From 212be8e18b57684bebfd17c2290098008a815c46 Mon Sep 17 00:00:00 2001 From: Dmitry Matveev Date: Fri, 18 Oct 2024 01:13:00 +0100 Subject: [PATCH 12/32] NPUW: Dynamic Spatial (#27104) ### Details: - Introduce a new SPATIAL pipeline which is a shortcut to PIPELINE:REG+ISOLATE:COMPUTE+SPATIAL:ON; - Refactor some code re: spatial regions handling in models and requests; - Finally, introduce a dyn dispatch over the spatial range - Based on runtime-detected features - Can be disabled to measure full range performance ### Tickets: - E-143572 --- .../al/include/intel_npu/al/config/npuw.hpp | 3 +- .../al/include/npuw_private_properties.hpp | 12 +- .../intel_npu/src/al/src/config/npuw.cpp | 1 + .../src/plugin/npuw/compiled_model.cpp | 17 +-- .../src/plugin/npuw/compiled_model.hpp | 18 +-- .../plugin/npuw/just_sync_infer_request.cpp | 35 +++++- .../plugin/npuw/just_sync_infer_request.hpp | 5 +- .../npuw/partitioning/online/compiler.cpp | 25 ++++- .../plugin/npuw/partitioning/partitioning.cpp | 2 +- .../plugin/npuw/partitioning/partitioning.hpp | 16 +-- .../intel_npu/src/plugin/npuw/spatial.cpp | 44 ++++++++ .../intel_npu/src/plugin/npuw/spatial.hpp | 106 ++++++++++++++++++ .../intel_npu/src/plugin/npuw/util.cpp | 37 ++++++ .../intel_npu/src/plugin/npuw/util.hpp | 4 + 14 files changed, 270 insertions(+), 55 deletions(-) create mode 100644 src/plugins/intel_npu/src/plugin/npuw/spatial.cpp create mode 100644 src/plugins/intel_npu/src/plugin/npuw/spatial.hpp diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp index fef9470545482a..b0ecf3cd45d152 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp @@ -45,7 +45,8 @@ DEFINE_OPT(NPUW_DQ, bool, false, npuw::partitioning::dyn_quant, CompileTime); DEFINE_OPT(NPUW_PMM, std::string, "2", npuw::partitioning::par_matmul_merge_dims, CompileTime); DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, CompileTime); DEFINE_OPT(NPUW_SPATIAL, bool, false, npuw::partitioning::spatial, CompileTime); -DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 64, npuw::partitioning::spatial_nway, CompileTime); +DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 128, npuw::partitioning::spatial_nway, CompileTime); +DEFINE_OPT(NPUW_SPATIAL_DYN, bool, true, npuw::partitioning::spatial_dyn, CompileTime); DEFINE_OPT(NPUW_DCOFF_TYPE, std::string, "", npuw::partitioning::dcoff_type, CompileTime); DEFINE_OPT(NPUW_DCOFF_SCALE, bool, false, npuw::partitioning::dcoff_with_scale, CompileTime); DEFINE_OPT(NPUW_FUNCALL_FOR_ALL, bool, false, 
npuw::partitioning::funcall_for_all, CompileTime); diff --git a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp index 059977ee47a063..834f90db9cf9ef 100644 --- a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp @@ -67,7 +67,7 @@ namespace online { * @brief * Type: std::string. * Specify which partitioning pipeline to run. - * Possible values: "NONE", "INIT", "JUST", "REP", "REG", "COMPUTE". + * Possible values: "NONE", "INIT", "JUST", "REP", "REG", "COMPUTE", "SPATIAL". * Default value: "REG". */ static constexpr ov::Property pipeline{"NPUW_ONLINE_PIPELINE"}; @@ -206,10 +206,18 @@ static constexpr ov::Property spatial{"NPUW_SPATIAL"}; * @brief * Type: std::size_t. * Submission size for the spatial execution. - * Default value: 64 + * Default value: 128 */ static constexpr ov::Property spatial_nway{"NPUW_SPATIAL_NWAY"}; +/** + * @brief + * Type: boolean. + * Enable dynamic submission for spatial subgraphs. Requires SPATIAL pipeline to be selected. + * Default value: true + */ +static constexpr ov::Property spatial_dyn{"NPUW_SPATIAL_DYN"}; + /** * @brief * Type: boolean diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp index 4aa4a88b9b5ba7..b5180633e4357e 100644 --- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp +++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp @@ -30,6 +30,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); desc.add(); desc.add(); desc.add(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index a312a806cac4bc..6ae61fc42410b8 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -283,18 +283,8 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, // Fill in the spatial information, if it is present if (fcn_template._spatial) { - using S = CompiledModelDesc::Spatial; - S s; - s.range = fcn_template._spatial->_range; - s.nway = fcn_template._spatial->_slice; - s.out_dim = fcn_template._spatial->_out_dim; - s.nway_iters = s.range / s.nway; - s.tail_size = s.range % s.nway; - for (auto&& input : fcn_template._spatial->_inputs) { - std::size_t p_idx = fcn_template._model->get_parameter_index(input.param); - s.params.push_back(S::Param{p_idx, input.dim}); - } - m_compiled_submodels[id].spatial = std::move(s); + m_compiled_submodels[id].spatial = + compiled::Spatial(fcn_template._spatial.value(), fcn_template._model); } LOG_INFO("Subgraph[" << id << "] is a function body for " << subgraph._funcall); } else { @@ -918,7 +908,8 @@ void ov::npuw::CompiledModel::implement_properties() { BIND(npuw::partitioning::dyn_quant, NPUW_DQ), BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM), BIND(npuw::partitioning::spatial, NPUW_SPATIAL), - BIND(npuw::partitioning::spatial, NPUW_SPATIAL_NWAY), + BIND(npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY), + BIND(npuw::partitioning::spatial_dyn, NPUW_SPATIAL_DYN), BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER), BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL), BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE), diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp 
index 038c1bb176b029..7a02ae1c8a485a 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2023 Intel Corporation +// Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -13,6 +13,7 @@ #include "openvino/runtime/icompiled_model.hpp" #include "openvino/runtime/so_ptr.hpp" #include "partitioning/partitioning.hpp" +#include "spatial.hpp" #include "weights_bank.hpp" namespace intel_npu { @@ -123,20 +124,7 @@ class CompiledModel : public ov::ICompiledModel { std::optional replaced_by; Subgraph::Gather host_gather; - struct Spatial { - struct Param { - std::size_t idx; - std::size_t dim; - }; - std::vector params; - std::size_t range = 0u; - std::size_t nway = 0u; - std::size_t out_dim = 0u; - - std::size_t nway_iters = 0u; - std::size_t tail_size = 0u; - }; - std::optional spatial; + std::optional spatial; // FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure // w.r.t. function calls diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index c4e2c3ee98b676..bac69f0a3b0d36 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -199,6 +199,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrm_cfg.get<::intel_npu::NPUW_SPATIAL_DYN>()) { + LOG_VERB("Finding spatial features..."); + LOG_BLOCK(); + m_spatial_selector = runtime::spatial::AttentionMask::find(*this); + if (!m_spatial_selector) { + LOG_WARN("Spatial capability is enabled, but no run-time features were found."); + // Fallback selector to ALL + m_spatial_selector.reset(new runtime::spatial::All()); + } + } else { + // Just force selector to ALL + m_spatial_selector.reset(new runtime::spatial::All()); + } + LOG_VERB("Done"); + } } void ov::npuw::JustInferRequest::connect_subrequests() { @@ -506,6 +527,11 @@ void ov::npuw::JustInferRequest::prepare_for_infer() { LOG_DEBUG("Pre-initializing weights for subgraph[" << id << "]"); unpack_closure(id, m_subrequests[id]); } + + // Adjust spatial input range, if supported + if (m_spatial_selector) { + m_spatial_selector->prepare(); + } LOG_DEBUG("Done"); } @@ -915,6 +941,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { // must be prepared in the m_spatial_io at this point const auto& spatial = comp_model_desc.spatial.value(); const auto num_outputs = comp_model_desc.compiled_model->outputs().size(); + NPUW_ASSERT(m_spatial_selector); // Create a sparse vector with full input sizes. 
// For the access simplicity, its size is aligned with function's @@ -940,6 +967,10 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { std::size_t offset = 0u; for (std::size_t i = 0u; i < spatial.nway_iters; i++, offset += spatial.nway) { + if (!m_spatial_selector->need_submit(offset, spatial.nway)) { + continue; + } + // Collect spatial inputs for this offset for (auto&& param : spatial.params) { const auto& iport = comp_model_desc.compiled_model->inputs()[param.idx]; @@ -963,7 +994,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { } // for(full_nway_times) // Now process the tail, if required - if (spatial.tail_size) { + if (spatial.tail_size && m_spatial_selector->need_submit(offset, spatial.tail_size)) { // Copy the sub-ranges to spatial inputs // NOTE: tails buffers are read from/written to at 0th offset! for (auto&& param : spatial.params) { @@ -1085,7 +1116,7 @@ ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocMem(const ov::element::Type return ov::get_tensor_impl(ov::Tensor(type, shape)); } - std::lock_guard guard(m_alloc_mutex); + // Protect access to shared context(s) - at least among infer requests auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr; auto remote_tensor = remote_ctx->create_host_tensor(type, shape); return ov::get_tensor_impl(ov::make_tensor(remote_tensor)); diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp index 88838d8b39d75f..fb105fd7629c7c 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp @@ -15,6 +15,7 @@ #include "openvino/runtime/iremote_context.hpp" #include "openvino/runtime/make_tensor.hpp" #include "openvino/runtime/tensor.hpp" +#include "spatial.hpp" namespace ov { namespace npuw { @@ -148,8 +149,10 @@ class JustInferRequest final : public IBaseInferRequest { }; std::vector m_subrequests_gio; - std::mutex m_alloc_mutex; std::unordered_set m_input_allocated; + + // Represents spatial run-time info + runtime::spatial::Selector::Ptr m_spatial_selector; }; } // namespace npuw diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp index a66159e6b4d1b7..a06a6f3bd1ced5 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp @@ -267,12 +267,13 @@ void dump_partitioning(const ov::npuw::Ensemble& ens, const std::string& to) { // Interface to get online partitioning from the model class Compiler { enum class Pipeline { - NONE, // Partitioning will consist of a single group with all the Ops - INIT, // Initialize only. The hardest mode, every group has just 1 layer inside - JUST, // "justParitioning" - combination of LHF + Remnants - REP, // Repeated blocks pipeline - combination of repeatedBlocks and Remnants - REG, // Regularized repeated blocks pipeline -same as REP, but with some strong hints first - COMPUTE // Separates non-foldable compute subgraphs from the model based on predefined rules + REP + NONE, // Partitioning will consist of a single group with all the Ops + INIT, // Initialize only. 
The hardest mode, every group has just 1 layer inside + JUST, // "justParitioning" - combination of LHF + Remnants + REP, // Repeated blocks pipeline - combination of repeatedBlocks and Remnants + REG, // Regularized repeated blocks pipeline - same as REP, but with some strong hints first + COMPUTE, // Separates non-foldable compute subgraphs from the model based on predefined rules + REP + SPATIAL // Similar to COMPUTE but allows folding }; template @@ -299,6 +300,8 @@ class Compiler { return Pipeline::REG; } else if (pipeline_opt == "COMPUTE") { return Pipeline::COMPUTE; + } else if (pipeline_opt == "SPATIAL") { + return Pipeline::SPATIAL; } else { LOG_WARN("Unknown partitioning compiler pipeline " << pipeline_opt << ", switching to REP"); return Pipeline::REP; @@ -428,6 +431,16 @@ class Compiler { m_snapshot->setCtx(ctx); rep(); break; + case Pipeline::SPATIAL: + warn_unused<::intel_npu::NPUW_ONLINE_ISOLATE>(); + m_cfg.update(::intel_npu::Config::ConfigMap{{std::string(::intel_npu::NPUW_SPATIAL::key()), "YES"}}); + + // Manually set predefined isolates and nofolds then do rep() pipeline + // FIXME: initialize via a dedicated function instead of parsing + ctx.isolates = detail::getIsolates(detail::ISOL_PRESETS.at("COMPUTE")); + m_snapshot->setCtx(ctx); + rep(); + break; } LOG_DEBUG("Online partitioning: group sizes after compilation:"); diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp index 5e3f12fedf68a6..f12350e8952eaa 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp @@ -1605,7 +1605,7 @@ void Partitioner::identifySpatialRange(ov::npuw::Function& f) { const auto& f_params = f._model->get_parameters(); NPUW_ASSERT(f_params.size() > 0); - using S = ov::npuw::Function::Spatial; + using S = ov::npuw::function::Spatial; S spatial; spatial._range = f_result_0_shape[1]; spatial._out_dim = 1; // the only case we're looking into now diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp index f60c6eff62e96b..5343ba26e6e5aa 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp @@ -10,6 +10,7 @@ #include #include "../lazy_tensor.hpp" +#include "../spatial.hpp" #include "intel_npu/al/config/config.hpp" #include "openvino/openvino.hpp" @@ -70,20 +71,7 @@ struct Function { // NOTE: it seems it is required only for `matchRepeatedSubgraphs()' std::map, std::size_t> _param_mapping; - // Spatial information. So far assume spatial execution in 1 dimension only - struct Spatial { - using PPtr = std::shared_ptr; - struct Param { - PPtr param; - std::size_t dim; - }; - std::size_t _range = 0u; // Range over which spatial execution is organized, e.g. 1024 - std::size_t _slice = 0u; // A submission size for a single execution, e.g. 
128 - std::size_t _out_dim = 0u; // Assume it is the same dim for all Results - std::vector _inputs; - }; - using SpatialOpt = std::optional; - SpatialOpt _spatial; + std::optional _spatial; }; struct Group { diff --git a/src/plugins/intel_npu/src/plugin/npuw/spatial.cpp b/src/plugins/intel_npu/src/plugin/npuw/spatial.cpp new file mode 100644 index 00000000000000..a7ea56dd3ff910 --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/spatial.cpp @@ -0,0 +1,44 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "spatial.hpp" + +#include "util.hpp" + +ov::npuw::runtime::spatial::AttentionMask::AttentionMask(std::size_t param_idx, const ov::ISyncInferRequest& rq) + : m_attn_mask_param_idx(param_idx), + m_rq(rq) {} + +ov::npuw::runtime::spatial::Selector::Ptr ov::npuw::runtime::spatial::AttentionMask::find( + const ov::ISyncInferRequest& rq) { + auto is_attn_mask = [](const ov::Output& p) { + const auto shape = p.get_shape(); + return p.get_node()->get_friendly_name() == "attention_mask" && + (shape.size() == 1 || (shape.size() == 2 && shape[0] == 1)); + }; + + const auto& inputs = rq.get_inputs(); + auto attn_mask_iter = std::find_if(inputs.begin(), inputs.end(), is_attn_mask); + if (attn_mask_iter != inputs.end()) { + const auto param_idx = std::distance(inputs.begin(), attn_mask_iter); + return Selector::Ptr{new AttentionMask(param_idx, rq)}; + } + return Selector::Ptr{}; +} + +void ov::npuw::runtime::spatial::AttentionMask::prepare() { + // Find the current valid range for this attention mask + // Here we have the following (very strong) assumption: + // The attention mask is dense (that is, has zero or one continuous interest region) + const auto& iport = m_rq.get_compiled_model()->inputs()[m_attn_mask_param_idx]; + std::tie(m_valid_range_begin, m_valid_range_end) = ov::npuw::util::validMaskRange(m_rq.get_tensor(iport)); +} + +bool ov::npuw::runtime::spatial::AttentionMask::need_submit(std::size_t offset, std::size_t len) const { + // We don't submit this request if + // - it is completely below the valid range + // - it is completely above the valid range + // in all other cases, we do + return !(offset + len < m_valid_range_begin || offset >= m_valid_range_end); +} diff --git a/src/plugins/intel_npu/src/plugin/npuw/spatial.hpp b/src/plugins/intel_npu/src/plugin/npuw/spatial.hpp new file mode 100644 index 00000000000000..fce2f63db4e807 --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/spatial.hpp @@ -0,0 +1,106 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "openvino/openvino.hpp" +#include "openvino/runtime/icompiled_model.hpp" +#include "openvino/runtime/isync_infer_request.hpp" + +namespace ov { +namespace npuw { + +namespace function { + +// Partition-time spatial information. So far assume spatial execution in 1 dimension only +// Defined at this level to be aligned with other partitioning entities (but needs to be moved) +struct Spatial { + using PPtr = std::shared_ptr; + struct Param { + PPtr param; + std::size_t dim; + }; + std::size_t _range = 0u; // Range over which spatial execution is organized, e.g. 1024 + std::size_t _slice = 0u; // A submission size for a single execution, e.g. 128 + std::size_t _out_dim = 0u; // Assume it is the same dim for all Results + std::vector _inputs; +}; + +} // namespace function + +namespace compiled { + +// Compile-time spatial information. 
Not much different from the above +struct Spatial { + struct Param { + std::size_t idx; // function input index for this spatial parameter + std::size_t dim; + }; + std::vector params; + std::size_t range = 0u; // NB: duplication of the above + std::size_t nway = 0u; // NB: duplication of the above + std::size_t out_dim = 0u; // NB: duplication of the above + + std::size_t nway_iters = 0u; + std::size_t tail_size = 0u; + + Spatial(const function::Spatial& s, const std::shared_ptr& m) + : range(s._range), + nway(s._slice), + out_dim(s._out_dim), + nway_iters(range / nway), + tail_size(range % nway) { + for (auto&& input : s._inputs) { + std::size_t p_idx = m->get_parameter_index(input.param); + params.push_back(Param{p_idx, input.dim}); + } + } +}; + +} // namespace compiled + +namespace runtime { +namespace spatial { + +// A base class to decide the work-scope from some feature +class Selector { +public: + using Ptr = std::shared_ptr; + virtual ~Selector() = default; + virtual void prepare() = 0; + virtual bool need_submit(std::size_t offset, std::size_t len) const = 0; +}; + +// No dynamic dispatch - just run over the whole range +class All final : public Selector { + void prepare() override {} + bool need_submit(std::size_t, std::size_t) const override { + return true; + } +}; + +// Define work scope based on attention mask +class AttentionMask final : public Selector { + std::size_t m_attn_mask_param_idx = 0u; + std::size_t m_valid_range_begin = 0u; + std::size_t m_valid_range_end = 0u; + + const ov::ISyncInferRequest& m_rq; + + AttentionMask(std::size_t param_idx, const ov::ISyncInferRequest& rq); + void prepare() override; + bool need_submit(std::size_t offset, std::size_t len) const override; + +public: + static Selector::Ptr find(const ov::ISyncInferRequest& rq); +}; + +} // namespace spatial +} // namespace runtime + +} // namespace npuw +} // namespace ov diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.cpp b/src/plugins/intel_npu/src/plugin/npuw/util.cpp index 1de8f4de4bdb4f..da62d040c06095 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.cpp @@ -518,3 +518,40 @@ ov::Tensor ov::npuw::util::concat(const std::vector& tt, std::size_t NPUW_ASSERT(false && "Not supported yet"); } } + +namespace { +template +ov::npuw::util::range_1d validMaskRange(const T* data, std::size_t len) { + using R = ov::npuw::util::range_1d; + std::size_t range_begin = 0u; + bool was_set = false; + + for (std::size_t idx = 0u; idx < len; idx++) { + const bool is_set = static_cast(data[idx] > 0); + + if (is_set && !was_set) { + was_set = true; + range_begin = idx; + } else if (!is_set && was_set) { + return R{range_begin, idx}; + } + } + return was_set ? 
R{range_begin, len} : R{0u, 0u}; +} +} // namespace + +ov::npuw::util::range_1d ov::npuw::util::validMaskRange(const ov::SoPtr& src) { + NPUW_ASSERT(src->is_continuous()); + + namespace ove = ov::element; +#define HNDL(t, T) \ + case ove::t: \ + return ::validMaskRange(static_cast(src->data()), src->get_size()); + switch (src->get_element_type()) { + HNDL(i64, int64_t); + HNDL(i32, int32_t); + default: + OPENVINO_THROW("Unsupported type ", src->get_element_type()); + } +#undef HNDL +} diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.hpp b/src/plugins/intel_npu/src/plugin/npuw/util.hpp index 02d2c8c097811e..a826d00e032977 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.hpp @@ -64,6 +64,10 @@ ov::Tensor transpose(const ov::Tensor& t); ov::Tensor permute(const ov::Tensor& t, const std::vector& axes); ov::Tensor concat(const std::vector& tt, std::size_t axis); +// Start is inclusive, end is exclusive +using range_1d = std::pair; +range_1d validMaskRange(const ov::SoPtr& t); + namespace at { template struct Impl { From ac7cb8b62065adc18ed17a81b668f87ad0ca48c3 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Fri, 18 Oct 2024 09:14:47 +0400 Subject: [PATCH 13/32] [GPU] Save use_onednn attribute in the blob (#27097) ### Details: - This is needed to have correct runtime impl selection of imported model ### Tickets: - *CVS-154891* --- src/plugins/intel_gpu/src/graph/program.cpp | 4 ++ .../tests/unit/test_cases/gemm_gpu_test.cpp | 51 ++++++++++++++----- 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 3a3793e8ad764d..d4461b8aad9107 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -1776,6 +1776,7 @@ void program::save(cldnn::BinaryOutputBuffer& ob) const { ob << _is_body_program; ob << _can_be_optimized; + ob << get_layout_optimizer().get_optimization_attributes().use_onednn_impls; processing_order.save(ob); { @@ -1895,6 +1896,9 @@ void program::load(cldnn::BinaryInputBuffer& ib) { ib >> _is_body_program; ib >> _can_be_optimized; + int32_t use_onednn_attr = 0; + ib >> use_onednn_attr; + get_layout_optimizer().set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, use_onednn_attr); _loaded_from_cache = true; processing_order.load(ib, *this); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp index 51f66f3abb7bfe..3b41f44050e527 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp @@ -15,6 +15,7 @@ #include "intel_gpu/runtime/compilation_context.hpp" #include "gemm_inst.h" #include "permute_inst.h" +#include "layout_optimizer.h" #include #include @@ -625,7 +626,7 @@ class gemm_gpu_tests: public ::testing::Test { topology topology; topology.add(input_layout("input1", in1_layout), input_layout("input2", in2_layout), - gemm("gemm_ref", { input_info("input1"), input_info("input2") }, data_types::f16, + gemm("gemm_ref", { input_info("input1"), input_info("input2") }, data_types::f16, {0, 2, 1, 3}, {0, 2, 3, 1}, {0, 1, 2, 3}) ); @@ -652,7 +653,7 @@ class gemm_gpu_tests: public ::testing::Test { topology topology; topology.add(input_layout("input1", in1_layout), input_layout("input2", in2_layout), - gemm("gemm", { input_info("input1"), input_info("input2") 
}, data_types::f16, + gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f16, {0, 2, 1, 3}, {0, 2, 3, 1}, {0, 1, 2, 3}) ); @@ -2789,7 +2790,7 @@ INSTANTIATE_TEST_SUITE_P(gemm_gpu, gemm_onednn_ndims, ::testing::ValuesIn(std::v class gemm_onednn: public ::testing::Test { public: - void test_impl_replacement_with_cldnn() { + void test_impl_replacement_with_cldnn(bool is_caching_test) { auto& engine = get_test_engine(); if (!engine.get_device_info().supports_immad) @@ -2828,16 +2829,34 @@ class gemm_onednn: public ::testing::Test { ov::intel_gpu::optimize_data(true), ov::intel_gpu::allow_new_shape_infer(true) }; - network network(engine, topology, cfg); - network.set_input_data("input1", input1); - network.set_input_data("input2", input2); + cldnn::network::ptr network; + if (is_caching_test) { + membuf mem_buf; + { + std::ostream out_mem(&mem_buf); + BinaryOutputBuffer ob = BinaryOutputBuffer(out_mem); + ob.set_stream(get_test_stream_ptr().get()); + program::build_program(engine, topology, cfg)->save(ob); + } + { + std::istream in_mem(&mem_buf); + BinaryInputBuffer ib = BinaryInputBuffer(in_mem, engine); + auto imported_prog = std::make_shared(engine, cfg); + imported_prog->load(ib); + network = std::make_shared(imported_prog); + } + } else { + network = std::make_shared(engine, topology, cfg); + } + network->set_input_data("input1", input1); + network->set_input_data("input2", input2); - auto inst = network.get_primitive("gemm"); + auto inst = network->get_primitive("gemm"); auto impl = inst->get_impl(); ASSERT_TRUE(impl != nullptr); ASSERT_TRUE(impl->is_dynamic()); - auto outputs = network.execute(); + auto outputs = network->execute(); auto output = outputs.at("gemm").get_memory(); cldnn::mem_lock output_ptr(output, get_test_stream()); @@ -2847,12 +2866,15 @@ class gemm_onednn: public ::testing::Test { ASSERT_FLOAT_EQ(output_ptr[i], out_data[i]); } - // WA: Call wait_all() to wait for all queued kernels compilation finish - network.get_program()->get_compilation_context().wait_all(); + // Call wait_all() to wait for all queued kernels compilation finish + network->get_program()->get_compilation_context().wait_all(); + + auto& lo = network->get_program()->get_layout_optimizer(); + ASSERT_TRUE(lo.get_optimization_attributes().use_onednn_impls); // Check if OneDNN's impl is used for the next execute() call - network.execute(); - inst = network.get_primitive("gemm"); + network->execute(); + inst = network->get_primitive("gemm"); impl = inst->get_impl(); ASSERT_TRUE(impl != nullptr); ASSERT_FALSE(impl->is_dynamic()); @@ -3214,7 +3236,10 @@ class gemm_onednn: public ::testing::Test { }; TEST_F(gemm_onednn, impl_replacement_with_cldnn) { - this->test_impl_replacement_with_cldnn(); + this->test_impl_replacement_with_cldnn(false); +} +TEST_F(gemm_onednn, impl_replacement_with_cldnn_cached) { + this->test_impl_replacement_with_cldnn(true); } // Check gemm_onednn transpose_format() can accept transpose white list format (byfx/bxfy) From 4bf52c13d005649e9583b426624613eadebf89cf Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Fri, 18 Oct 2024 09:16:43 +0400 Subject: [PATCH 14/32] [GPU] Fix USMHost tensor sharing between models from different Cores (#27105) ### Details: - Treat USMHost tensor from another context as non-sharable --- .../src/plugin/sync_infer_request.cpp | 4 +-- .../functional/behavior/infer_request.cpp | 26 +++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp 
b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp index 88d69dcd3e47b3..58e99e037fb931 100644 --- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp +++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp @@ -547,7 +547,7 @@ TensorWrapper SyncInferRequest::create_or_share_device_tensor(const TensorWrappe bool can_share = !is_convert_required(user_tensor->get_element_type(), element_type) && can_use_usm_host(engine) && !generic_remote_tensor; - if (usm_host_tensor && can_share) { + if (usm_host_tensor && can_share && m_context == usm_host_tensor->get_impl()->get_context()) { return { usm_host_tensor->get_impl(), user_tensor_wrapper.owner }; } else if (usm_host_raw_ptr && can_share) { return { std::make_shared(m_context, @@ -727,7 +727,7 @@ std::vector SyncInferRequest::prepare_input(const std::string auto usm_host_ptr = std::dynamic_pointer_cast(user_tensor); bool is_generic_remote = iremote_tensor_ptr != nullptr && remote_tensor_impl_ptr == nullptr; bool is_remote_tensor_impl = remote_tensor_impl_ptr != nullptr; - bool is_usm_host_tensor = usm_host_ptr != nullptr; + bool is_usm_host_tensor = usm_host_ptr != nullptr && usm_host_ptr->get_impl()->get_context() == m_context; GPU_DEBUG_TRACE_DETAIL << "Prepare input for " << internal_name << " (is_remote_tensor_impl ? " << is_remote_tensor_impl diff --git a/src/plugins/intel_gpu/tests/functional/behavior/infer_request.cpp b/src/plugins/intel_gpu/tests/functional/behavior/infer_request.cpp index d82384f1eb8366..201c91fe9a60c3 100644 --- a/src/plugins/intel_gpu/tests/functional/behavior/infer_request.cpp +++ b/src/plugins/intel_gpu/tests/functional/behavior/infer_request.cpp @@ -6,7 +6,11 @@ #include "common_test_utils/test_common.hpp" #include "common_test_utils/common_utils.hpp" #include "common_test_utils/node_builders/activation.hpp" +#include "openvino/core/partial_shape.hpp" #include "openvino/core/preprocess/pre_post_process.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/relu.hpp" +#include "openvino/op/result.hpp" #include "openvino/runtime/core.hpp" #include "transformations/utils/utils.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" @@ -369,4 +373,26 @@ TEST(TensorTest, smoke_outputTensorShapesForDynamicInput) { OV_ASSERT_NO_THROW(inf_req.infer()); ASSERT_EQ(inf_req.get_output_tensor().get_shape(), output3_shape); } + +TEST(TensorTest, smoke_canShareTensorIfModelsFromDifferentCores) { + auto core1 = ov::Core(); + auto core2 = ov::Core(); + + auto param = std::make_shared(ov::element::f32, ov::PartialShape{4, 8}); + auto relu = std::make_shared(param); + auto result = std::make_shared(relu); + auto model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{param}); + + auto compiled_model1 = core1.compile_model(model, ov::test::utils::DEVICE_GPU); + auto compiled_model2 = core2.compile_model(model, ov::test::utils::DEVICE_GPU); + + auto request1 = compiled_model1.create_infer_request(); + auto request2 = compiled_model2.create_infer_request(); + + request2.set_input_tensor(request1.get_output_tensor()); + request2.set_output_tensor(request1.get_input_tensor()); + + OV_ASSERT_NO_THROW(request1.infer()); + OV_ASSERT_NO_THROW(request2.infer()); +} } // namespace From 939b35a96293bf9b02a4eb8732632c3700f46ce5 Mon Sep 17 00:00:00 2001 From: Dan Liu Date: Thu, 17 Oct 2024 22:24:18 -0700 Subject: [PATCH 15/32] [NPU]Change NPUBackend log (#27073) ### Details: [log.error of 
NPUbackend](https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_npu/src/plugin/src/backends.cpp#L130) will confuse the user, thought NPU backend does not impact the compilation and just impact the inference stage. Now [in the inference stage in compiledmodel part](https://github.com/openvinotoolkit/openvino/pull/27073/files#diff-74bc81bb7b258118f04e81468e3ec3b05e65e714546d32246bae45eb892f6abcR125-R130), will get a log.error output when no npu device is checked. ### Tickets: - 153439 --- src/plugins/intel_npu/src/plugin/src/backends.cpp | 5 +++-- .../intel_npu/src/plugin/src/compiled_model.cpp | 11 ++++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/src/backends.cpp b/src/plugins/intel_npu/src/plugin/src/backends.cpp index 1019cff3287995..9b090e4ec91529 100644 --- a/src/plugins/intel_npu/src/plugin/src/backends.cpp +++ b/src/plugins/intel_npu/src/plugin/src/backends.cpp @@ -111,7 +111,7 @@ NPUBackends::NPUBackends(const std::vector& backendRegistry, } catch (const std::exception& ex) { _logger.warning("Got an error during backend '%s' loading : %s", backendName.c_str(), ex.what()); } catch (...) { - _logger.error("Got an unknown error during backend '%s' loading", backendName.c_str()); + _logger.warning("Got an unknown error during backend '%s' loading", backendName.c_str()); } } @@ -127,7 +127,8 @@ NPUBackends::NPUBackends(const std::vector& backendRegistry, if (_backend != nullptr) { _logger.info("Use '%s' backend for inference", _backend->getName().c_str()); } else { - _logger.error("Cannot find backend for inference. Make sure the device is available."); + _logger.warning("None of the backends were initialized successfully." + "Only offline compilation can be done!"); } } diff --git a/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp index 51ed0e2c5c4858..91aa19499d9de5 100644 --- a/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp @@ -24,6 +24,9 @@ namespace { constexpr std::string_view NO_EXECUTOR_FOR_INFERENCE = + "Can't create infer request due to create executor failed! Only exports can be made."; + +constexpr std::string_view NO_EXECUTOR_FOR_INFERENCE_NODEVICE = "Can't create infer request!\n" "Please make sure that the device is available. Only exports can be made."; @@ -118,8 +121,14 @@ std::shared_ptr CompiledModel::create_infer_request() co if (_executorPtr == nullptr && _device != nullptr) { _executorPtr = _device->createExecutor(_networkPtr, _config); } + if (_executorPtr == nullptr) { - OPENVINO_THROW(NO_EXECUTOR_FOR_INFERENCE); + if (_device != nullptr) { + OPENVINO_THROW(NO_EXECUTOR_FOR_INFERENCE); + } else { + _logger.error("Can not find device!"); + OPENVINO_THROW(NO_EXECUTOR_FOR_INFERENCE_NODEVICE); + } } const std::shared_ptr& syncInferRequest = From a8293f3b0f94036d757050f6792a599a04be4f53 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 18 Oct 2024 07:04:38 +0000 Subject: [PATCH 16/32] Bump actions/upload-artifact from 4.4.0 to 4.4.3 (#27113) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.4.0 to 4.4.3.
Release notes (sourced from actions/upload-artifact's releases):
- v4.4.3: full changelog https://github.com/actions/upload-artifact/compare/v4.4.2...v4.4.3
- v4.4.2: full changelog https://github.com/actions/upload-artifact/compare/v4.4.1...v4.4.2
- v4.4.1: full changelog https://github.com/actions/upload-artifact/compare/v4.4.0...v4.4.1

Commits:
- b4b15b8 Merge pull request #632 from actions/joshmgross/undo-dependency-changes
- 92b01eb Undo indirect dependency updates from #627
- 8448086 Merge pull request #627 from actions/robherley/v4.4.2
- b1d4642 add explicit relative and absolute symlinks to workflow
- d50e660 bump version
- aabe6f8 build with @actions/artifact v2.1.11
- 604373d Merge pull request #625 from actions/robherley/artifact-2.1.10
- 0150148 paste right core version
- a009b25 update licenses
- 9f6f6f4 update @actions/core and @actions/artifact to latest versions
- Additional commits viewable in compare view
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/android_arm64.yml | 2 +- .github/workflows/android_x64.yml | 2 +- .github/workflows/build_doc.yml | 6 +++--- .github/workflows/coverity.yml | 4 ++-- .../workflows/dev_cpu_linux_snippets_libxsmm.yml | 8 ++++---- .github/workflows/job_build_linux.yml | 16 ++++++++-------- .github/workflows/job_build_windows.yml | 8 ++++---- .github/workflows/job_cpu_functional_tests.yml | 2 +- .github/workflows/job_cxx_unit_tests.yml | 2 +- .github/workflows/job_gpu_tests.yml | 2 +- .github/workflows/job_jax_models_tests.yml | 2 +- .github/workflows/job_onnx_models_tests.yml | 2 +- .github/workflows/job_python_unit_tests.yml | 2 +- .github/workflows/job_pytorch_layer_tests.yml | 2 +- .github/workflows/job_pytorch_models_tests.yml | 2 +- .github/workflows/job_tensorflow_layer_tests.yml | 2 +- .../workflows/job_tensorflow_models_tests.yml | 2 +- .github/workflows/job_tokenizers.yml | 2 +- .../workflows/linux_conditional_compilation.yml | 8 ++++---- .github/workflows/linux_sanitizers.yml | 6 +++--- .github/workflows/mac.yml | 8 ++++---- .github/workflows/mac_arm64.yml | 8 ++++---- .github/workflows/py_checks.yml | 6 +++--- .github/workflows/ubuntu_22.yml | 4 ++-- .../windows_conditional_compilation.yml | 6 +++--- .github/workflows/windows_vs2019_release.yml | 4 ++-- 26 files changed, 59 insertions(+), 59 deletions(-) diff --git a/.github/workflows/android_arm64.yml b/.github/workflows/android_arm64.yml index 6a67e8f6793ec9..15094a84ee8f5f 100644 --- a/.github/workflows/android_arm64.yml +++ b/.github/workflows/android_arm64.yml @@ -178,7 +178,7 @@ jobs: # Upload build logs # - name: Upload build logs - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: always() with: name: build_logs diff --git a/.github/workflows/android_x64.yml b/.github/workflows/android_x64.yml index cab5239b4c45c0..cebaa9177b69b9 100644 --- a/.github/workflows/android_x64.yml +++ b/.github/workflows/android_x64.yml @@ -152,7 +152,7 @@ jobs: # Upload build logs # - name: Upload build logs - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: always() with: name: build_logs diff --git a/.github/workflows/build_doc.yml b/.github/workflows/build_doc.yml index 53f3eba9a749bf..535be1e4e70457 100644 --- a/.github/workflows/build_doc.yml +++ b/.github/workflows/build_doc.yml @@ -77,13 +77,13 @@ jobs: echo "PR_NUMBER=$PR_NUMBER" >> $GITHUB_ENV - name: 'Upload sphinx.log' - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: sphinx_build_log_${{ env.PR_NUMBER }}.log path: build/docs/sphinx.log - name: 'Upload docs html' - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_docs_html_${{ env.PR_NUMBER }}.zip path: build/docs/openvino_docs_html.zip @@ -100,7 +100,7 @@ jobs: - name: 'Upload test results' if: failure() - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_docs_pytest path: build/docs/_artifacts/ diff 
--git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index 6a163fb5e50043..8a2338554faae3 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -144,7 +144,7 @@ jobs: run: ${COVERITY_TOOL_DIR}/cov-analysis*/bin/cov-configure -c ${COVERITY_TOOL_DIR}/cov-analysis-linux64-2023.6.2/config/coverity_config.xml -lscc text - name: Upload Coverity build log - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: always() with: name: coverity_logs @@ -152,7 +152,7 @@ jobs: if-no-files-found: 'error' - name: Upload Coverity build archive - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: always() with: name: coverity_archive diff --git a/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml b/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml index 83770900559bab..26e8400c22a04f 100644 --- a/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml +++ b/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml @@ -169,7 +169,7 @@ jobs: # Upload build artifacts and logs # - name: Upload build logs - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: always() with: name: build_logs @@ -178,7 +178,7 @@ jobs: - name: Upload openvino package if: ${{ always() }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_package path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz @@ -186,7 +186,7 @@ jobs: - name: Upload openvino tests package if: ${{ always() }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_tests path: ${{ env.BUILD_DIR }}/openvino_tests.tar.gz @@ -325,7 +325,7 @@ jobs: timeout-minutes: 25 - name: Upload Test Results - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: always() with: name: test-results-functional-cpu diff --git a/.github/workflows/job_build_linux.yml b/.github/workflows/job_build_linux.yml index b8eea4375e7e58..86545b6e9e7a43 100644 --- a/.github/workflows/job_build_linux.yml +++ b/.github/workflows/job_build_linux.yml @@ -249,7 +249,7 @@ jobs: # Upload build artifacts and logs # - name: Upload build logs - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: always() with: name: build_logs @@ -258,7 +258,7 @@ jobs: - name: Upload openvino package if: ${{ always() }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_package path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz @@ -266,7 +266,7 @@ jobs: - name: Upload openvino wheels if: ${{ inputs.os != 'debian_10' && inputs.arch != 'arm' }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_wheels 
path: ${{ env.INSTALL_WHEELS_DIR }}/wheels/*.whl @@ -274,7 +274,7 @@ jobs: - name: Upload openvino js package if: ${{ fromJSON(inputs.affected-components).JS_API && inputs.build-js }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_js_package path: ${{ env.INSTALL_DIR_JS }} @@ -282,7 +282,7 @@ jobs: - name: Upload openvino developer package if: ${{ always() }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_developer_package path: ${{ env.BUILD_DIR }}/openvino_developer_package.tar.gz @@ -290,7 +290,7 @@ jobs: - name: Upload openvino RPM packages if: ${{ inputs.build-rpm-packages }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_rpm_packages path: ${{ env.BUILD_DIR }}/*.rpm @@ -298,7 +298,7 @@ jobs: - name: Upload openvino debian packages if: ${{ inputs.build-debian-packages }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_debian_packages path: ${{ env.BUILD_DIR }}/*.deb @@ -306,7 +306,7 @@ jobs: - name: Upload openvino tests package if: ${{ always() }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_tests path: ${{ env.BUILD_DIR }}/openvino_tests.tar.gz diff --git a/.github/workflows/job_build_windows.yml b/.github/workflows/job_build_windows.yml index c8e249513a08f0..df2544d9d9e60a 100644 --- a/.github/workflows/job_build_windows.yml +++ b/.github/workflows/job_build_windows.yml @@ -218,21 +218,21 @@ jobs: # - name: Upload openvino package - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_package path: ${{ env.BUILD_DIR }}/openvino_package.zip if-no-files-found: 'error' - name: Upload openvino wheels - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_wheels path: ${{ env.BUILD_DIR }}/wheels/*.whl if-no-files-found: 'error' - name: Upload openvino tests package - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_tests path: ${{ env.BUILD_DIR }}/openvino_tests.zip @@ -240,7 +240,7 @@ jobs: - name: Upload openvino js package if: ${{ fromJSON(inputs.affected-components).JS_API }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_js_package path: ${{ env.INSTALL_DIR_JS }} diff --git a/.github/workflows/job_cpu_functional_tests.yml b/.github/workflows/job_cpu_functional_tests.yml index 6848871df6e81e..e197d581f290a4 100644 --- a/.github/workflows/job_cpu_functional_tests.yml +++ b/.github/workflows/job_cpu_functional_tests.yml @@ -116,7 +116,7 @@ jobs: key: ${{ runner.os }}-${{ runner.arch 
}}-tests-functional-cpu-stamp-${{ github.sha }} - name: Upload Test Results - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: always() with: name: test-results-functional-cpu diff --git a/.github/workflows/job_cxx_unit_tests.yml b/.github/workflows/job_cxx_unit_tests.yml index 99c363d04d23a7..3f871151ccd282 100644 --- a/.github/workflows/job_cxx_unit_tests.yml +++ b/.github/workflows/job_cxx_unit_tests.yml @@ -257,7 +257,7 @@ jobs: ${{ env.INSTALL_TEST_DIR }}/ov_hetero_func_tests --gtest_print_time=1 --gtest_output=xml:${{ env.INSTALL_TEST_DIR }}/TEST-OVHeteroFuncTests.xml --gtest_filter="*smoke*" - name: Upload Test Results - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: ${{ !cancelled() }} with: name: test-results-cpp diff --git a/.github/workflows/job_gpu_tests.yml b/.github/workflows/job_gpu_tests.yml index 324e653c57ebab..b9862eac09cc05 100644 --- a/.github/workflows/job_gpu_tests.yml +++ b/.github/workflows/job_gpu_tests.yml @@ -128,7 +128,7 @@ jobs: - name: Upload Test Results - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: always() with: name: test-results-${{ inputs.test_type }}-${{ inputs.device }} diff --git a/.github/workflows/job_jax_models_tests.yml b/.github/workflows/job_jax_models_tests.yml index 9956a27f234b36..ea2669071386dd 100644 --- a/.github/workflows/job_jax_models_tests.yml +++ b/.github/workflows/job_jax_models_tests.yml @@ -100,7 +100,7 @@ jobs: TEST_DEVICE: CPU - name: Upload Test Results - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: ${{ !cancelled() }} with: name: test-results-jax-models-${{ inputs.model_scope }} diff --git a/.github/workflows/job_onnx_models_tests.yml b/.github/workflows/job_onnx_models_tests.yml index 321aa88d614310..c879f0cb6a1efc 100644 --- a/.github/workflows/job_onnx_models_tests.yml +++ b/.github/workflows/job_onnx_models_tests.yml @@ -112,7 +112,7 @@ jobs: python3 -m pytest --backend="CPU" --model_zoo_dir="${MODELS_SHARE_PATH}" ${INSTALL_TEST_DIR}/onnx/tests/tests_python/test_zoo_models.py -v -n auto --forked -k 'not _cuda' --model_zoo_xfail - name: Upload logs from pytest - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: always() with: name: onnx_models_tests_logs diff --git a/.github/workflows/job_python_unit_tests.yml b/.github/workflows/job_python_unit_tests.yml index d63262c665d45c..1fafafd7623545 100644 --- a/.github/workflows/job_python_unit_tests.yml +++ b/.github/workflows/job_python_unit_tests.yml @@ -276,7 +276,7 @@ jobs: --ignore=${INSTALL_TEST_DIR}/pyopenvino/tests/test_utils/test_utils.py - name: Upload Test Results - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: ${{ !cancelled() }} with: name: test-results-python diff --git a/.github/workflows/job_pytorch_layer_tests.yml b/.github/workflows/job_pytorch_layer_tests.yml index 95074dc84f1ff9..abf614c70cff4e 100644 --- a/.github/workflows/job_pytorch_layer_tests.yml +++ 
b/.github/workflows/job_pytorch_layer_tests.yml @@ -147,7 +147,7 @@ jobs: PYTORCH_TRACING_MODE: TORCHFX - name: Upload Test Results - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: ${{ !cancelled() }} with: name: test-results-python-pytorch-layers diff --git a/.github/workflows/job_pytorch_models_tests.yml b/.github/workflows/job_pytorch_models_tests.yml index a77c1318f3a0c8..74915f1d9b823f 100644 --- a/.github/workflows/job_pytorch_models_tests.yml +++ b/.github/workflows/job_pytorch_models_tests.yml @@ -171,7 +171,7 @@ jobs: df -h - name: Upload Test Results - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: ${{ !cancelled() }} with: name: test-results-torch-models-${{ inputs.model_scope }} diff --git a/.github/workflows/job_tensorflow_layer_tests.yml b/.github/workflows/job_tensorflow_layer_tests.yml index ae6e91a00d1497..977b2e4f96af73 100644 --- a/.github/workflows/job_tensorflow_layer_tests.yml +++ b/.github/workflows/job_tensorflow_layer_tests.yml @@ -150,7 +150,7 @@ jobs: TEST_PRECISION: FP16 - name: Upload Test Results - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: ${{ !cancelled() }} with: name: test-results-python-tf-layers diff --git a/.github/workflows/job_tensorflow_models_tests.yml b/.github/workflows/job_tensorflow_models_tests.yml index db34ec7b793551..0990eae3de6e7e 100644 --- a/.github/workflows/job_tensorflow_models_tests.yml +++ b/.github/workflows/job_tensorflow_models_tests.yml @@ -107,7 +107,7 @@ jobs: TEST_DEVICE: CPU - name: Upload Test Results - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: ${{ !cancelled() }} with: name: test-results-tensorflow-models-${{ inputs.model_scope }} diff --git a/.github/workflows/job_tokenizers.yml b/.github/workflows/job_tokenizers.yml index 238dbfec3a34eb..f7388eb98a2f3c 100644 --- a/.github/workflows/job_tokenizers.yml +++ b/.github/workflows/job_tokenizers.yml @@ -133,7 +133,7 @@ jobs: - name: Upload openvino tokenizers wheel if: ${{ always() }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_tokenizers_wheel path: ${{ env.EXTENSION_BUILD_DIR }}/*.whl diff --git a/.github/workflows/linux_conditional_compilation.yml b/.github/workflows/linux_conditional_compilation.yml index 7b5467b01ad73e..42d7810b9f1663 100644 --- a/.github/workflows/linux_conditional_compilation.yml +++ b/.github/workflows/linux_conditional_compilation.yml @@ -223,7 +223,7 @@ jobs: # Upload build artifacts and logs # - name: Upload build logs - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: always() with: name: build_logs @@ -232,7 +232,7 @@ jobs: - name: Upload openvino package if: ${{ always() }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_package path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz @@ 
-240,7 +240,7 @@ jobs: - name: Upload selective build statistics package if: ${{ always() }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_selective_build_stat path: ${{ env.BUILD_DIR }}/openvino_selective_build_stat.tar.gz @@ -248,7 +248,7 @@ jobs: - name: Upload OpenVINO tests package if: ${{ always() }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_tests path: ${{ env.BUILD_DIR }}/openvino_tests.tar.gz diff --git a/.github/workflows/linux_sanitizers.yml b/.github/workflows/linux_sanitizers.yml index b23e67a0f2b30e..e1a71fe92dc1a3 100644 --- a/.github/workflows/linux_sanitizers.yml +++ b/.github/workflows/linux_sanitizers.yml @@ -188,7 +188,7 @@ jobs: - name: Upload openvino package if: ${{ always() }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_package_${{ matrix.SANITIZER }} path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz @@ -196,7 +196,7 @@ jobs: - name: Upload openvino tests package if: ${{ always() }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_tests_${{ matrix.SANITIZER }} path: ${{ env.BUILD_DIR }}/openvino_tests.tar.gz @@ -465,7 +465,7 @@ jobs: ${INSTALL_TEST_DIR}/ov_hetero_func_tests --gtest_print_time=1 --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-OVHeteroFuncTests.xml --gtest_filter="*smoke*" - name: Upload Test Results - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: ${{ !cancelled() }} with: name: test-results-cpp_${{ matrix.SANITIZER }} diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 6e3f344c6dd944..bddbaab134fa9c 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -205,14 +205,14 @@ jobs: - name: Upload openvino package if: ${{ always() }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_package path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz if-no-files-found: 'error' - name: Upload openvino wheels - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_wheels path: ${{ env.INSTALL_WHEELS_DIR }}/wheels/*.whl @@ -220,7 +220,7 @@ jobs: - name: Upload openvino tests package if: ${{ always() }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_tests path: ${{ env.BUILD_DIR }}/openvino_tests.tar.gz @@ -228,7 +228,7 @@ jobs: - name: Upload openvino js package if: fromJSON(needs.smart_ci.outputs.affected_components).JS_API - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_js_package path: ${{ env.INSTALL_DIR_JS }} diff --git 
a/.github/workflows/mac_arm64.yml b/.github/workflows/mac_arm64.yml index 16658318de20d8..576eefde8c9b4a 100644 --- a/.github/workflows/mac_arm64.yml +++ b/.github/workflows/mac_arm64.yml @@ -206,14 +206,14 @@ jobs: - name: Upload openvino package if: ${{ always() }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_package path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz if-no-files-found: 'error' - name: Upload openvino wheels - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_wheels path: ${{ env.INSTALL_WHEELS_DIR }}/wheels/*.whl @@ -221,7 +221,7 @@ jobs: - name: Upload openvino tests package if: ${{ always() }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_tests path: ${{ env.BUILD_DIR }}/openvino_tests.tar.gz @@ -229,7 +229,7 @@ jobs: - name: Upload openvino js package if: fromJSON(needs.smart_ci.outputs.affected_components).JS_API - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_js_package path: ${{ env.INSTALL_DIR_JS }} diff --git a/.github/workflows/py_checks.yml b/.github/workflows/py_checks.yml index ae0625ce4a453c..2b0d3f2272787f 100644 --- a/.github/workflows/py_checks.yml +++ b/.github/workflows/py_checks.yml @@ -50,7 +50,7 @@ jobs: git diff > samples_diff.diff working-directory: samples/python - - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: failure() with: name: samples_diff @@ -68,7 +68,7 @@ jobs: git diff > pyopenvino_diff.diff working-directory: src/bindings/python/src/openvino - - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: failure() with: name: pyopenvino_diff @@ -86,7 +86,7 @@ jobs: git diff > wheel_diff.diff working-directory: src/bindings/python/wheel - - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: failure() with: name: wheel_diff diff --git a/.github/workflows/ubuntu_22.yml b/.github/workflows/ubuntu_22.yml index 2ebca2b059fdd2..5e5ac3c3482624 100644 --- a/.github/workflows/ubuntu_22.yml +++ b/.github/workflows/ubuntu_22.yml @@ -227,7 +227,7 @@ jobs: - name: Upload Conformance Artifacts if: ${{ always() }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: conformance_artifacts_${{ matrix.TEST_TYPE }}-${{ env.TEST_DEVICE }} path: ${{ env.CONFORMANCE_ARTIFACTS_DIR }}/conformance_artifacts.tar.gz @@ -253,7 +253,7 @@ jobs: - name: Upload Conformance Artifacts if: ${{ matrix.TEST_TYPE == 'API' }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: conformance_artifacts_${{ matrix.TEST_TYPE }}-TEMPLATE path: ${{ 
env.CONFORMANCE_ARTIFACTS_DIR }}/conformance_artifacts.tar.gz diff --git a/.github/workflows/windows_conditional_compilation.yml b/.github/workflows/windows_conditional_compilation.yml index 9c026f01e47233..30b2ce2f20df38 100644 --- a/.github/workflows/windows_conditional_compilation.yml +++ b/.github/workflows/windows_conditional_compilation.yml @@ -249,7 +249,7 @@ jobs: - name: Upload selective build statistics package if: ${{ always() }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_selective_build_stat path: ${{ env.BUILD_DIR }}/openvino_selective_build_stat.zip @@ -257,7 +257,7 @@ jobs: - name: Upload OpenVINO tests package if: ${{ always() }} - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: openvino_tests path: ${{ env.BUILD_DIR }}/openvino_tests.zip @@ -402,7 +402,7 @@ jobs: timeout-minutes: 60 - name: Upload Test Results - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: ${{ !cancelled() }} with: name: test-results-functional-cpu diff --git a/.github/workflows/windows_vs2019_release.yml b/.github/workflows/windows_vs2019_release.yml index 8cac2b88078d15..bce90165408815 100644 --- a/.github/workflows/windows_vs2019_release.yml +++ b/.github/workflows/windows_vs2019_release.yml @@ -391,7 +391,7 @@ jobs: run: python3 -m pytest -s ${{ env.INSTALL_TEST_DIR }}/ovc/unit_tests --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-OpenVinoConversion.xml - name: Upload Test Results - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: ${{ !cancelled() }} with: name: test-results-python @@ -502,7 +502,7 @@ jobs: key: ${{ runner.os }}-tests-functional-cpu-stamp-${{ github.sha }} - name: Upload Test Results - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: ${{ !cancelled() }} with: name: test-results-functional-cpu From e582f61a65ba430ae005f927695e3fe68b4f4f20 Mon Sep 17 00:00:00 2001 From: Luwei Zhou Date: Fri, 18 Oct 2024 15:05:22 +0800 Subject: [PATCH 17/32] [Transformations] Fix exception when converting precision on Read_Value node without inputs. (#26829) ### Details: - *Read value node without input source on FP16 precision would raise exception. The PR fix this.* ### Tickets: - *CVS-153067* --- .../src/transformations/convert_precision.cpp | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/common/transformations/src/transformations/convert_precision.cpp b/src/common/transformations/src/transformations/convert_precision.cpp index 54fb6a972b7387..6f5166dfd26760 100644 --- a/src/common/transformations/src/transformations/convert_precision.cpp +++ b/src/common/transformations/src/transformations/convert_precision.cpp @@ -29,7 +29,7 @@ bool fuse_type_to_parameter(const std::shared_ptr& node, bool convert_input_precision); // this function inserts Convert operations to 'data' input and outputs of `node` -// to execute 'node' with the original type. +// to execute 'node' with the original type. This function supports nodes with single output. 
bool wrap_into_original_type(const std::shared_ptr& node, const precisions_map& precisions); bool store_original_type_as_attribute(const std::shared_ptr& node, const precisions_map& precisions); @@ -622,17 +622,20 @@ bool wrap_into_original_type(const std::shared_ptr& node, const precis const auto& to = it->second; const auto& from = it->first; - - auto convert_before = std::make_shared(node->input_value(0), from); - node->input(0).replace_source_output(convert_before); - auto consumers = node->output(0).get_target_inputs(); - auto convert_after = std::make_shared(node, to); - for (auto& input : consumers) { - const auto consumer = input.get_node(); - if (ov::is_type(consumer) || ov::is_type(consumer)) { - continue; + if (node->get_input_size()) { + auto convert_before = std::make_shared(node->input_value(0), from); + node->input(0).replace_source_output(convert_before); + } + if (node->get_output_size() == 1) { + auto consumers = node->output(0).get_target_inputs(); + auto convert_after = std::make_shared(node, to); + for (auto& input : consumers) { + const auto consumer = input.get_node(); + if (ov::is_type(consumer) || ov::is_type(consumer)) { + continue; + } + input.replace_source_output(convert_after); } - input.replace_source_output(convert_after); } return true; From 5a65547fd398863d4bad41d66a2fa6838542734b Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Fri, 18 Oct 2024 16:05:29 +0200 Subject: [PATCH 18/32] [TESTS] Fix version for huggingface_hub (#27126) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- tests/requirements_pytorch | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/requirements_pytorch b/tests/requirements_pytorch index 0bda286eb83252..40e1f6f66f52e8 100644 --- a/tests/requirements_pytorch +++ b/tests/requirements_pytorch @@ -38,6 +38,9 @@ torchaudio==2.2.2; platform_system == "Darwin" and platform_machine == "x86_64" wheel==0.44.0 PyYAML==6.0.2 kornia==0.7.3 +super-image==0.1.7 +# huggingface-hub required for super-image +huggingface-hub==0.25.2 # use latest released version once it's available git+https://github.com/huggingface/optimum-intel.git@main; python_version < "3.12" From 62183ab7a695a6939bb82d7d9243ef6db77ef944 Mon Sep 17 00:00:00 2001 From: Andrzej Kopytko Date: Fri, 18 Oct 2024 14:24:37 +0200 Subject: [PATCH 19/32] [DOCS] Moved versioning to repo (#27128) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- docs/sphinx_setup/assets/versions_raw.js | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/sphinx_setup/assets/versions_raw.js diff --git a/docs/sphinx_setup/assets/versions_raw.js b/docs/sphinx_setup/assets/versions_raw.js new file mode 100644 index 00000000000000..8045057450bf5f --- /dev/null +++ b/docs/sphinx_setup/assets/versions_raw.js @@ -0,0 +1 @@ +var data='[{"version": "2024"}, {"version": "2023.3"}, {"version": "2022.3"}, {"version": "nightly"}, {"version": "archives"}]'; \ No newline at end of file From 43df0b6fc1714779bad48736066e917a045de346 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Fri, 18 Oct 2024 05:51:24 -0700 Subject: [PATCH 20/32] TorchFX: GPTQ accuracy fix (#26294) ### Details: - Fix for the accuracy issues discovered in Llama2 GPTQ with aot_autograd ### Tickets: - [CVS-149032](https://jira.devtools.intel.com/browse/CVS-149032) --------- Co-authored-by: Maxim Vafin --- .../workflows/job_pytorch_models_tests.yml | 11 + .../torchfx_gptq_pattern_replacer.cpp | 188 +++++++++++++----- .../models/gptq-torchfx-models-precommit | 1 + .../test_gptq_torchfx_transformations.py | 102 
++++++++++ 4 files changed, 250 insertions(+), 52 deletions(-) create mode 100644 tests/model_hub_tests/transformation_tests/models/gptq-torchfx-models-precommit create mode 100644 tests/model_hub_tests/transformation_tests/test_gptq_torchfx_transformations.py diff --git a/.github/workflows/job_pytorch_models_tests.yml b/.github/workflows/job_pytorch_models_tests.yml index 74915f1d9b823f..8f3699f6ab42a2 100644 --- a/.github/workflows/job_pytorch_models_tests.yml +++ b/.github/workflows/job_pytorch_models_tests.yml @@ -160,6 +160,17 @@ jobs: TEST_DEVICE: CPU USE_SYSTEM_CACHE: False + - name: TorchFX GPTQ Pattern Test + if: ${{ inputs.model_scope == 'precommit' }} + # install torch 2.3.1 as newer is not yet supported by openvino backend + run: | + export PYTHONPATH=${MODEL_HUB_TESTS_INSTALL_DIR}:$PYTHONPATH + python3 -m pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --upgrade --index-url https://download.pytorch.org/whl/cpu + python3 -m pytest ${MODEL_HUB_TESTS_INSTALL_DIR}/transformation_tests/test_gptq_torchfx_transformations.py -m precommit --html=${INSTALL_TEST_DIR}/TEST-torch_gptqpattern_tests.html --self-contained-html -v --tb=short + env: + TEST_DEVICE: CPU + USE_SYSTEM_CACHE: False + - name: Reformat unsupported ops file if: ${{ inputs.model_scope != 'precommit' && !cancelled()}} run: | diff --git a/src/frontends/pytorch/src/transforms/torchfx_gptq_pattern_replacer.cpp b/src/frontends/pytorch/src/transforms/torchfx_gptq_pattern_replacer.cpp index a533739b16fea1..caeeb8c557b380 100644 --- a/src/frontends/pytorch/src/transforms/torchfx_gptq_pattern_replacer.cpp +++ b/src/frontends/pytorch/src/transforms/torchfx_gptq_pattern_replacer.cpp @@ -40,18 +40,6 @@ uint32_t read_u4_data(const void* array, size_t index) { return val; }; -void write_u4_data(void* array, size_t index, uint32_t data) { - auto arr_u32 = reinterpret_cast(array); - size_t idx_u32 = index / 8; - size_t offset_u32 = index % 8; - uint32_t old_val = arr_u32[idx_u32]; - data = data << (offset_u32 * 4); - uint32_t mask = 15; - mask = ~(mask << (offset_u32 * 4)); - uint32_t new_val = (old_val & mask) | data; - arr_u32[idx_u32] = new_val; -}; - GPTQDecompressionReplacer::GPTQDecompressionReplacer() { const auto& const_1 = wrap_type(); const auto& const_2 = wrap_type(); @@ -73,61 +61,157 @@ GPTQDecompressionReplacer::GPTQDecompressionReplacer() { const auto& convert_2 = wrap_type({const_6}); const auto& bitwise_and = wrap_type({add_or_convert, convert_2}); - ov::matcher_pass_callback callback = [unsqueeze_1](Matcher& m) { + ov::matcher_pass_callback callback = [=](Matcher& m) { auto bitwise_and = m.get_match_root(); if (!bitwise_and) { return false; } const auto& pattern_map = m.get_pattern_value_map(); - const auto& input_node = pattern_map.at(unsqueeze_1).get_node_shared_ptr(); - auto weights_u32 = std::dynamic_pointer_cast(input_node->get_input_node_shared_ptr(0)); - auto axis = std::dynamic_pointer_cast(input_node->get_input_node_shared_ptr(1)); - auto axis_data = axis->get_data_ptr(); - - auto u8_shape = weights_u32->get_shape(); - auto src = weights_u32->get_data_ptr(); - - ov::Shape u4_shape; - bool dim_added = false; - size_t stride = 1; - size_t size_y = 1; - for (size_t i = 0; i < u8_shape.size(); i++) { - if (axis_data[0] == i) { - u4_shape.push_back(8); - dim_added = true; - } - if (axis_data[0] <= i) { - stride *= u8_shape[i]; - } else { - size_y *= u8_shape[i]; - } - u4_shape.push_back(u8_shape[i]); + auto unsqueeze_1_node = pattern_map.at(unsqueeze_1).get_node_shared_ptr(); + auto 
unsqueeze_1_in0_const = + std::dynamic_pointer_cast(unsqueeze_1_node->get_input_node_shared_ptr(0)); + auto unsqueeze_1_in1_const = + std::dynamic_pointer_cast(unsqueeze_1_node->get_input_node_shared_ptr(1)); + auto abs_node = pattern_map.at(abs).get_node_shared_ptr(); + auto abs_in_const = std::dynamic_pointer_cast(abs_node->get_input_node_shared_ptr(0)); + auto broadcast_node = pattern_map.at(broadcast).get_node_shared_ptr(); + auto unsqueeze_2_node = pattern_map.at(unsqueeze_2).get_node_shared_ptr(); + auto unsqueeze_2_in0_const = + std::dynamic_pointer_cast(unsqueeze_2_node->get_input_node_shared_ptr(0)); + auto unsqueeze_2_in1_const = + std::dynamic_pointer_cast(unsqueeze_2_node->get_input_node_shared_ptr(1)); + + OutputVector outputs_1(unsqueeze_1_node->get_output_size()); + OutputVector unsqueeze_1_inputs(2); + unsqueeze_1_inputs[0] = unsqueeze_1_in0_const->outputs()[0]; + unsqueeze_1_inputs[1] = unsqueeze_1_in1_const->outputs()[0]; + if (!unsqueeze_1_node->constant_fold(outputs_1, unsqueeze_1_inputs)) { + return false; } - if (!dim_added) { - u4_shape.push_back(8); + + OutputVector outputs_2(abs_node->get_output_size()); + if (!abs_node->constant_fold(outputs_2, abs_in_const->outputs())) { + return false; } - auto new_const = std::make_shared(element::u4, u4_shape); - auto dst = const_cast(reinterpret_cast(new_const->get_data_ptr())); + OutputVector outputs_3(broadcast_node->get_output_size()); + OutputVector broadcast_inputs(2); + broadcast_inputs[0] = outputs_1[0]; + broadcast_inputs[1] = outputs_2[0]; + if (!broadcast_node->constant_fold(outputs_3, broadcast_inputs)) { + return false; + } + + OutputVector outputs_4(unsqueeze_2_node->get_output_size()); + OutputVector unsqueeze_2_inputs(2); + unsqueeze_2_inputs[0] = unsqueeze_2_in0_const->outputs()[0]; + unsqueeze_2_inputs[1] = unsqueeze_2_in1_const->outputs()[0]; + if (!unsqueeze_2_node->constant_fold(outputs_4, unsqueeze_2_inputs)) { + return false; + } + const int32_t* rs_in0 = + std::dynamic_pointer_cast(outputs_3[0].get_node_shared_ptr())->get_data_ptr(); + const int32_t* rs_in1 = + std::dynamic_pointer_cast(outputs_4[0].get_node_shared_ptr())->get_data_ptr(); + auto shifted_const = std::make_shared(element::i32, outputs_3[0].get_shape()); + auto dst = const_cast(reinterpret_cast(shifted_const->get_data_ptr())); if (!dst) return false; - size_t in_idx = 0; - for (size_t y = 0; y < size_y; y++) { - size_t offset = y * stride * 8; - for (size_t x = 0; x < stride; x++) { - for (size_t z = 0; z < 8; z++) { - uint32_t val = read_u4_data(src, in_idx); - write_u4_data(dst, (offset + x + stride * z), val); - in_idx++; - } + // TODO: Bitwise right shift operation below might need to be + // optimized to reduce FIL. 
+ size_t rs_in0_shape_size = shape_size(outputs_3[0].get_shape()); + const auto& rs_in0_shape = outputs_3[0].get_shape(); + const auto& rs_in1_shape = outputs_4[0].get_shape(); + int shift_dim = -1; + size_t shift_offset = 1; + for (size_t i = 0; i < rs_in1_shape.size(); ++i) { + size_t dim = rs_in1_shape[i]; + if (dim != 1 && dim != rs_in0_shape[i]) { + return false; + } + if (shift_dim != -1) { + shift_offset *= rs_in0_shape[i]; + } + if (dim == rs_in0_shape[i]) { + shift_dim = static_cast(i); + } + } + if (shift_dim == -1) + return false; + for (size_t k = 0; k < rs_in0_shape_size; ++k) { + size_t shift_idx = (k / shift_offset) % rs_in1_shape[shift_dim]; + int32_t shift_val = rs_in1[shift_idx]; + dst[k] = (rs_in0[k] >> shift_val); + } + + std::shared_ptr convert_1_node = nullptr; + OutputVector outputs_7; + if (pattern_map.find(convert_1) != pattern_map.end()) { + convert_1_node = pattern_map.at(convert_1).get_node_shared_ptr(); + outputs_7.resize(convert_1_node->get_output_size()); + if (!convert_1_node->constant_fold(outputs_7, shifted_const->outputs())) { + return false; + } + } else { + auto convert_3_node = pattern_map.at(convert_3).get_node_shared_ptr(); + auto convert_4_node = pattern_map.at(convert_4).get_node_shared_ptr(); + auto convert_4_in_const = + std::dynamic_pointer_cast(convert_4_node->get_input_node_shared_ptr(0)); + auto add_node = pattern_map.at(add).get_node_shared_ptr(); + OutputVector outputs_5(convert_3_node->get_output_size()); + if (!convert_3_node->constant_fold(outputs_5, shifted_const->outputs())) { + return false; + } + OutputVector outputs_6(convert_4_node->get_output_size()); + if (!convert_4_node->constant_fold(outputs_6, convert_4_in_const->outputs())) { + return false; + } + outputs_7.resize(add_node->get_output_size()); + OutputVector add_inputs(2); + add_inputs[0] = outputs_5[0]; + add_inputs[1] = outputs_6[0]; + if (!add_node->constant_fold(outputs_7, add_inputs)) { + return false; } } - copy_runtime_info_and_name(weights_u32, {new_const}, {weights_u32, bitwise_and}); + auto convert_2_node = pattern_map.at(convert_2).get_node_shared_ptr(); + auto convert_2_in_const = std::dynamic_pointer_cast(convert_2_node->get_input_node_shared_ptr(0)); + + OutputVector outputs_8(convert_2_node->get_output_size()); + if (!convert_2_node->constant_fold(outputs_8, convert_2_in_const->outputs())) { + return false; + } + + OutputVector outputs_9(bitwise_and->get_output_size()); + + const int8_t* and_in0 = + std::dynamic_pointer_cast(outputs_7[0].get_node_shared_ptr())->get_data_ptr(); + const int8_t* and_in1 = + std::dynamic_pointer_cast(outputs_8[0].get_node_shared_ptr())->get_data_ptr(); + auto masked_const = std::make_shared(element::i8, outputs_7[0].get_shape()); + auto masked_dst = const_cast(reinterpret_cast(masked_const->get_data_ptr())); + if (!masked_dst) + return false; + + size_t and_in0_shape_size = shape_size(outputs_7[0].get_shape()); + // TODO: Bitwise and operation below might need to be + // optimized to reduce FIL. 
+ int8_t mask = and_in1[0]; + for (size_t k = 0; k < and_in0_shape_size; ++k) { + masked_dst[k] = (and_in0[k] & mask); + } + + auto convert_to_u4 = std::make_shared(masked_const, element::u4); + OutputVector outputs_10(convert_to_u4->get_output_size()); + if (!convert_to_u4->constant_fold(outputs_10, masked_const->outputs())) { + return false; + } - auto new_convert = std::make_shared(new_const, bitwise_and->get_output_element_type(0)); - copy_runtime_info_and_name(bitwise_and, {new_convert}, {input_node}); + auto new_convert = + std::make_shared(outputs_10[0].get_node_shared_ptr(), bitwise_and->get_output_element_type(0)); + copy_runtime_info_and_name(bitwise_and, {new_convert}, {unsqueeze_1_node}); replace_node(bitwise_and, new_convert); return true; }; diff --git a/tests/model_hub_tests/transformation_tests/models/gptq-torchfx-models-precommit b/tests/model_hub_tests/transformation_tests/models/gptq-torchfx-models-precommit new file mode 100644 index 00000000000000..b796dd2bf13b5a --- /dev/null +++ b/tests/model_hub_tests/transformation_tests/models/gptq-torchfx-models-precommit @@ -0,0 +1 @@ +atorsvn/TinyLlama-1.1B-Chat-v0.3-gptq-4bit,https://huggingface.co/atorsvn/TinyLlama-1.1B-Chat-v0.3-gptq-4bit diff --git a/tests/model_hub_tests/transformation_tests/test_gptq_torchfx_transformations.py b/tests/model_hub_tests/transformation_tests/test_gptq_torchfx_transformations.py new file mode 100644 index 00000000000000..dc57c02285e448 --- /dev/null +++ b/tests/model_hub_tests/transformation_tests/test_gptq_torchfx_transformations.py @@ -0,0 +1,102 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, pipeline +import torch +import hashlib +from openvino.frontend.pytorch.torchdynamo.execute import compiled_cache +import models_hub_common.utils as utils +import pytest +import os + +def patch_gptq(config): + do_gptq_patching = False + config_dict = config.to_dict() + quantization_config = config_dict.get("quantization_config", None) + do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq" + orig_cuda_check = torch.cuda.is_available + orig_post_init_model = None + if do_gptq_patching: + torch.set_default_dtype(torch.float32) + torch.cuda.is_available = lambda: False + + from optimum.gptq import GPTQQuantizer + + orig_post_init_model = GPTQQuantizer.post_init_model + + def post_init_model(self, model): + from auto_gptq import exllama_set_max_input_length + + class StoreAttr(object): + pass + + model.quantize_config = StoreAttr() + model.quantize_config.desc_act = self.desc_act + if self.desc_act and not self.disable_exllama and self.max_input_length is not None: + model = exllama_set_max_input_length(model, self.max_input_length) + return model + + GPTQQuantizer.post_init_model = post_init_model + return orig_cuda_check, orig_post_init_model + +def run_gptq_torchfx(tmp_path, model_id, model_link, prompt_result_pair): + config = AutoConfig.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float32) + cuda, post_init = patch_gptq(config) + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float32) + model = AutoModelForCausalLM.from_pretrained( + model_id, + trust_remote_code=True, + config=config, + device_map='cpu', + torch_dtype=torch.float32 + ) + + pipe = pipeline( + "text-generation", + model=model, + tokenizer=tokenizer, + max_new_tokens=4, + do_sample=True, + temperature=0.01, + top_p=0.01, 
+ top_k=1, + repetition_penalty=1.1, + num_beams=1, + ) + + prompt = prompt_result_pair["prompt"] + expected_md5 = prompt_result_pair["result_md5"] + + model.model.forward = torch.compile(model.model.forward, backend="openvino", dynamic=True, fullgraph=True, options={'aot_autograd': True}) + + result_ov = pipe(prompt) + md5_ov = hashlib.new("md5", result_ov[0]['generated_text'].encode(), usedforsecurity=False).hexdigest() + + u4_ops = ["FullyConnected",] + num_u4_ops = 0 + num_u4_ops_supported = 0 + for pid in compiled_cache: + for op in compiled_cache[pid].get_runtime_model().get_ordered_ops(): + if (str(op.get_rt_info()["layerType"].get()) in u4_ops): + u4_exec = (str(op.get_rt_info()["runtimePrecision"].get()) == "u4") + if u4_exec: + num_u4_ops_supported += 1 + num_u4_ops += 1 + + assert(expected_md5 == md5_ov), "Output does not match with the expected output" + assert((num_u4_ops > 0) and (num_u4_ops == num_u4_ops_supported)), "Runtime precision is not u4" + +@pytest.mark.precommit +@pytest.mark.parametrize("model_name, model_link, mark, reason", utils.get_models_list(os.path.join(os.path.dirname(__file__), "models", "gptq-torchfx-models-precommit"))) +@pytest.mark.parametrize('prompt_result_pair', ([ + {"prompt" : "Tell me about AI", "result_md5" : "4385ccbce14627ae91f846b4c8a3f145"}, +])) +def test_gptq_torchfx_precommit(tmp_path, model_name, model_link, mark, reason, prompt_result_pair, ie_device): + assert mark is None or mark == 'skip' or mark == 'xfail', \ + "Incorrect test case: {}, {}".format(model_name, model_link) + if mark == 'skip': + pytest.skip(reason) + elif mark == 'xfail': + pytest.xfail(reason) + run_gptq_torchfx(tmp_path, model_name, model_link, prompt_result_pair) + From 56fe26f6fb3c0617d39fd96f666231088cb8f5dc Mon Sep 17 00:00:00 2001 From: Maksim Doronin Date: Fri, 18 Oct 2024 14:29:02 +0100 Subject: [PATCH 21/32] Introduce protopipe (#27087) ### Details: - Publishing protopipe to open-source ### Tickets: - E-143100 --- .gitmodules | 3 + scripts/CMakeLists.txt | 1 + src/plugins/intel_npu/cmake/features.cmake | 2 + .../intel_npu/thirdparty/CMakeLists.txt | 12 + src/plugins/intel_npu/thirdparty/yaml-cpp | 1 + src/plugins/intel_npu/tools/CMakeLists.txt | 4 + .../intel_npu/tools/protopipe/CMakeLists.txt | 72 ++ .../intel_npu/tools/protopipe/README.md | 608 ++++++++++++ .../tools/protopipe/cmake/standalone.cmake | 63 ++ .../intel_npu/tools/protopipe/main.cpp | 266 ++++++ .../intel_npu/tools/protopipe/src/graph.cpp | 140 +++ .../intel_npu/tools/protopipe/src/graph.hpp | 168 ++++ .../tools/protopipe/src/parser/config.cpp | 872 ++++++++++++++++++ .../tools/protopipe/src/parser/config.hpp | 12 + .../tools/protopipe/src/parser/parser.cpp | 20 + .../tools/protopipe/src/parser/parser.hpp | 61 ++ .../intel_npu/tools/protopipe/src/result.cpp | 22 + .../intel_npu/tools/protopipe/src/result.hpp | 30 + .../src/scenario/accuracy_metrics.cpp | 121 +++ .../src/scenario/accuracy_metrics.hpp | 52 ++ .../protopipe/src/scenario/criterion.cpp | 72 ++ .../protopipe/src/scenario/criterion.hpp | 58 ++ .../protopipe/src/scenario/inference.cpp | 17 + .../protopipe/src/scenario/inference.hpp | 111 +++ .../protopipe/src/scenario/scenario_graph.cpp | 40 + .../protopipe/src/scenario/scenario_graph.hpp | 102 ++ .../protopipe/src/simulation/computation.cpp | 42 + .../protopipe/src/simulation/computation.hpp | 36 + .../src/simulation/computation_builder.cpp | 462 ++++++++++ .../src/simulation/computation_builder.hpp | 74 ++ .../protopipe/src/simulation/dummy_source.cpp | 89 ++ 
.../protopipe/src/simulation/dummy_source.hpp | 37 + .../protopipe/src/simulation/executor.cpp | 66 ++ .../protopipe/src/simulation/executor.hpp | 42 + .../protopipe/src/simulation/layers_data.cpp | 155 ++++ .../protopipe/src/simulation/layers_data.hpp | 57 ++ .../src/simulation/layers_reader.cpp | 46 + .../src/simulation/layers_reader.hpp | 27 + .../protopipe/src/simulation/operations.cpp | 131 +++ .../protopipe/src/simulation/operations.hpp | 77 ++ .../src/simulation/ov_layers_reader.cpp | 215 +++++ .../src/simulation/performance_mode.cpp | 337 +++++++ .../src/simulation/performance_mode.hpp | 41 + .../src/simulation/reference_mode.cpp | 361 ++++++++ .../src/simulation/reference_mode.hpp | 35 + .../protopipe/src/simulation/simulation.cpp | 131 +++ .../protopipe/src/simulation/simulation.hpp | 57 ++ .../src/simulation/validation_mode.cpp | 363 ++++++++ .../src/simulation/validation_mode.hpp | 34 + .../protopipe/src/utils/data_providers.cpp | 64 ++ .../protopipe/src/utils/data_providers.hpp | 70 ++ .../tools/protopipe/src/utils/error.hpp | 39 + .../tools/protopipe/src/utils/logger.cpp | 32 + .../tools/protopipe/src/utils/logger.hpp | 29 + .../tools/protopipe/src/utils/timer.cpp | 73 ++ .../tools/protopipe/src/utils/timer.hpp | 25 + .../tools/protopipe/src/utils/utils.cpp | 84 ++ .../tools/protopipe/src/utils/utils.hpp | 65 ++ .../tools/single-image-test/CMakeLists.txt | 2 +- 59 files changed, 6327 insertions(+), 1 deletion(-) create mode 160000 src/plugins/intel_npu/thirdparty/yaml-cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/CMakeLists.txt create mode 100644 src/plugins/intel_npu/tools/protopipe/README.md create mode 100644 src/plugins/intel_npu/tools/protopipe/cmake/standalone.cmake create mode 100644 src/plugins/intel_npu/tools/protopipe/main.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/graph.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/graph.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/parser/config.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/parser/config.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/parser/parser.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/parser/parser.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/result.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/result.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/scenario/accuracy_metrics.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/scenario/accuracy_metrics.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/scenario/criterion.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/scenario/criterion.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/scenario/inference.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/scenario/inference.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/scenario/scenario_graph.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/scenario/scenario_graph.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/computation.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/computation.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/computation_builder.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/computation_builder.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/dummy_source.cpp 
create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/dummy_source.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/executor.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/executor.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/layers_data.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/layers_data.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/layers_reader.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/layers_reader.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/operations.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/operations.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/ov_layers_reader.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/performance_mode.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/performance_mode.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/reference_mode.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/reference_mode.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/simulation.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/simulation.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/validation_mode.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/simulation/validation_mode.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/utils/data_providers.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/utils/data_providers.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/utils/error.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/utils/logger.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/utils/logger.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/utils/timer.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/utils/timer.hpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/utils/utils.cpp create mode 100644 src/plugins/intel_npu/tools/protopipe/src/utils/utils.hpp diff --git a/.gitmodules b/.gitmodules index a9cad1dee5f494..5feb7458da1801 100644 --- a/.gitmodules +++ b/.gitmodules @@ -78,6 +78,9 @@ [submodule "src/plugins/intel_npu/thirdparty/level-zero-ext"] path = src/plugins/intel_npu/thirdparty/level-zero-ext url = https://github.com/intel/level-zero-npu-extensions.git +[submodule "src/plugins/intel_npu/thirdparty/yaml-cpp"] + path = src/plugins/intel_npu/thirdparty/yaml-cpp + url = https://github.com/jbeder/yaml-cpp.git [submodule "thirdparty/telemetry"] path = thirdparty/telemetry url = https://github.com/openvinotoolkit/telemetry.git diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt index 73cdd57e508bdb..69ad9f460e357a 100644 --- a/scripts/CMakeLists.txt +++ b/scripts/CMakeLists.txt @@ -12,6 +12,7 @@ set(shellcheck_skip_list "${OpenVINO_SOURCE_DIR}/thirdparty" "${OpenVINO_SOURCE_DIR}/src/plugins/intel_cpu/thirdparty" "${OpenVINO_SOURCE_DIR}/src/plugins/intel_gpu/thirdparty" + "${OpenVINO_SOURCE_DIR}/src/plugins/intel_npu/thirdparty" "${OpenVINO_SOURCE_DIR}/src/bindings/python/thirdparty/pybind11" "${TEMP}") diff --git a/src/plugins/intel_npu/cmake/features.cmake b/src/plugins/intel_npu/cmake/features.cmake index 07efefd4452403..8a9dce04f071b9 100644 --- 
a/src/plugins/intel_npu/cmake/features.cmake +++ b/src/plugins/intel_npu/cmake/features.cmake @@ -20,3 +20,5 @@ if(NOT BUILD_SHARED_LIBS AND NOT ENABLE_MLIR_COMPILER AND NOT ENABLE_DRIVER_COMP endif() ov_dependent_option(ENABLE_IMD_BACKEND "Enable InferenceManagerDemo based NPU AL backend" OFF "NOT WIN32;NOT CMAKE_CROSSCOMPILING" OFF) + +ov_dependent_option(ENABLE_INTEL_NPU_PROTOPIPE "Enable Intel NPU Protopipe tool" ON "ENABLE_INTEL_NPU_INTERNAL" OFF) diff --git a/src/plugins/intel_npu/thirdparty/CMakeLists.txt b/src/plugins/intel_npu/thirdparty/CMakeLists.txt index 4d0c66beeb7520..b064b5c7b9acd5 100644 --- a/src/plugins/intel_npu/thirdparty/CMakeLists.txt +++ b/src/plugins/intel_npu/thirdparty/CMakeLists.txt @@ -12,3 +12,15 @@ if(ENABLE_ZEROAPI_BACKEND) add_library(LevelZero::NPUExt ALIAS level-zero-ext) install(TARGETS level-zero-ext EXPORT "${PROJECT_NAME}Targets") endif() + +# +# yaml-cpp +# + +if(ENABLE_INTEL_NPU_PROTOPIPE) + add_subdirectory(yaml-cpp EXCLUDE_FROM_ALL) + # NB: Suppress warnings in yaml-cpp + if(SUGGEST_OVERRIDE_SUPPORTED) + target_compile_options(yaml-cpp PRIVATE -Wno-suggest-override) + endif() +endif() diff --git a/src/plugins/intel_npu/thirdparty/yaml-cpp b/src/plugins/intel_npu/thirdparty/yaml-cpp new file mode 160000 index 00000000000000..da82fd982c260e --- /dev/null +++ b/src/plugins/intel_npu/thirdparty/yaml-cpp @@ -0,0 +1 @@ +Subproject commit da82fd982c260e7f335ce5acbceff24b270544d1 diff --git a/src/plugins/intel_npu/tools/CMakeLists.txt b/src/plugins/intel_npu/tools/CMakeLists.txt index c0e620981952e1..ac1a51f74519c8 100644 --- a/src/plugins/intel_npu/tools/CMakeLists.txt +++ b/src/plugins/intel_npu/tools/CMakeLists.txt @@ -6,3 +6,7 @@ add_subdirectory(common) add_subdirectory(compile_tool) add_subdirectory(single-image-test) + +if (ENABLE_INTEL_NPU_PROTOPIPE) + add_subdirectory(protopipe) +endif() diff --git a/src/plugins/intel_npu/tools/protopipe/CMakeLists.txt b/src/plugins/intel_npu/tools/protopipe/CMakeLists.txt new file mode 100644 index 00000000000000..9ba76d89ca8445 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/CMakeLists.txt @@ -0,0 +1,72 @@ +# +# Copyright (C) 2023-2024 Intel Corporation. +# SPDX-License-Identifier: Apache 2.0 +# + +set(TARGET_NAME protopipe) + +if (NOT DEFINED PROJECT_NAME) + cmake_minimum_required(VERSION 3.13 FATAL_ERROR) + project(protopipe_standalone) + include("cmake/standalone.cmake") + return() +endif() + +# +# Dependencies +# + +find_package(OpenCV QUIET COMPONENTS gapi) +if(OpenCV_VERSION VERSION_LESS 4.9) + message(STATUS "NPU ${TARGET_NAME} tool is disabled due to missing dependencies: gapi from OpenCV >= 4.9.") + return() +endif() + +if (WIN32) + # WA: add_tool_target expects to have all dependencies as cmake targets. 
+ add_library(winmm INTERFACE) + target_link_libraries(winmm INTERFACE "winmm.lib") +endif() + +# +# Define the target +# + +set(PROTOPIPE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src) + +ov_add_target(ADD_CPPLINT + TYPE EXECUTABLE + NAME ${TARGET_NAME} + ROOT ${CMAKE_CURRENT_SOURCE_DIR} + ADDITIONAL_SOURCE_DIRS ${PROTOPIPE_SOURCE_DIR} + INCLUDES ${PROTOPIPE_SOURCE_DIR} + LINK_LIBRARIES + PRIVATE + Threads::Threads + gflags + yaml-cpp + openvino::runtime + opencv_gapi + winmm) + + + +set_target_properties(${TARGET_NAME} PROPERTIES + FOLDER ${CMAKE_CURRENT_SOURCE_DIR} + CXX_STANDARD 17) + +# +# Install +# + +install(TARGETS ${TARGET_NAME} + RUNTIME DESTINATION "tools/${TARGET_NAME}" + COMPONENT ${NPU_INTERNAL_COMPONENT} + ${OV_CPACK_COMP_NPU_INTERNAL_EXCLUDE_ALL}) + +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/README.md") + install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/README.md" + DESTINATION "tools/${TARGET_NAME}" + COMPONENT ${NPU_INTERNAL_COMPONENT} + ${OV_CPACK_COMP_NPU_INTERNAL_EXCLUDE_ALL}) +endif() diff --git a/src/plugins/intel_npu/tools/protopipe/README.md b/src/plugins/intel_npu/tools/protopipe/README.md new file mode 100644 index 00000000000000..afe6e8cffbc8c3 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/README.md @@ -0,0 +1,608 @@ +# Protopipe +Protopipe is the C++ tool for simulating performance and validating accuracy of the various AI scenarios. + +Protopipe is built atop of [OpenCV G-API](https://github.com/opencv/opencv/wiki/Graph-API) and supports running inference through the [OpenVINO](https://github.com/openvinotoolkit/openvino) and [ONNXRuntime](https://github.com/microsoft/onnxruntime) frameworks. + +## Table of Contents +* [Quick start](#quick-start) +* [How to configure](#how-to-configure) + * [Global parameters](#global-parameters) + * [Model parameters](#model-parameters) + * [Graph structure](#graph-structure) + * [Dependency Graph](#dependency-graph) + * [Network sequence](#network-sequence) + * [Scenario parameters](#scenario-parameters) + * [Config example](#config-example) +* [How to run](#how-to-run) +* [Use cases](#use-cases) + * [Measure Performance](#measure-performance) + * [Generate Reference](#generate-reference) + * [Validate Accuracy](#validate-accuracy) +* [How to build](#how-to-build) + +## Quick start +Consider the following [Config example](#config-example) to start using Protopipe. + +Learn more about available config parameters (see: [How to configure](#how-to-configure)) and explore different execution modes (see: [Use-cases](#use-cases)) for more advanced usage. + +## How to configure +Protopipe uses **YAML** format file to describe the AI scenario structure and its parameters + +### Global parameters +The **YAML** config starts with specifying the several global parameters: +- `model_dir` - **Optional**. Path to the models location. (**Default**: ".") +- `blob_dir` - **Optional**. Path to the models location. (**Default**: ".") +- `device_name` - **Optional**. OpenVINO device name: _CPU_, _GPU_, etc. (**Default**: _NPU_) +- `compiler_type` - **Optional**. NPU compiler type: _DRIVER_, _MLIR_. (**Default**: _DRIVER_) +- `log_level` - **Optional**. Log level: _NONE_, _INFO_, _DEBUG_. (**Default**: _NONE_) +- `disable_high_resolution_waitable_timer` - **Optional**. Disables high resolution timer used to perform delays on Windows. 
(**Default**: false) + +Example: +``` +model_dir: + local: C:\workspace\models +device_name: NPU +compiler_type: MLIR +log_level: INFO +``` +### Model parameters +#### Common parameters +- `name` or `path` - **Required**. Path to the model file. +- `framework` - **Optional**. Framework to use for inference: *onnxrt*, *openvino*. (**Default**: *openvino*) +- `input_data`, `output_data`, `metric`, `random` - **Optional**. Follow [Use-cases](#use-cases) to learn the details. +#### OpenVINO parameters +- `priority` - **Optional**. Model priority: _HIGH_, _MEDIUM_, _LOW_. (Default: _MEDIUM_) +- `config` - **Optional**. OpenVINO Plugin specific parameters. +- `device` - **Optional**. OpenVINO device name. +- `ip` - **Optional**. Input layer precision: _FP16_, _FP32_, _U8_, _I32_. +- `op` - **Optional**. Output layer precision: _FP16_, _FP32_, _U8_, _I32_. +- `il` - **Optional**. Input layer layout. +- `ol` - **Optional**. Output layer layout. +- `iml` - **Optional**. Input model layout. +- `oml` - **Optional**. Output model layout. + +Examples: +``` +- { name: model.xml, ip: FP16, iml: NHWC, il: NCHW } +- { name: model.xml, ip: { data: FP16 }, priority: HIGH } +- { name: model.xml, device: NPU, config: { PERFORMANCE_HINT: THROUGHPUT } } +``` +#### ONNXRT parameters +- `ep` - **Optional**. Specifies the parameters for particular execution provider. +- `session_options` - **Optional**. Set various session options for the ONNX Runtime. + +##### Supported Execution Providers +- [OpenVINO Execution Provider](https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html) + - `name: OV` - **Required**. Enables OpenVINO Execution Provider. + - `device_type` - **Optional**.The device type: _NPU_U8_, _CPU_FP32_, etc. + - `params` - **Optional**. Accepts a map of options and their corresponding values that can be passed to OV EP. + +**Note**: If none of the supported execution providers are specified, the default `MLAS` will be used. + +Examples: +``` +- { name: model.onnx, framework: onnxrt } # Default (MLAS) EP will be used +- { name: model.onnx, framework: onnxrt, session_options: { session.disable_cpu_ep_fallback: 1 } } # Default (MLAS) EP with the sessions options will be used +- { name: model.onnx, framework: onnxrt, ep: { name: OV, device_type: NPU_U8, params: { enable_qdq_optimizer: False, model_priority: LOW } } } # OpenVINO EP will be used +``` + +### Graph structure +There are two ways to describe the execution graph structure in Protopipe: +1. Using [Dependency Graph](#dependency-graph) (preferable) +2. Using [Network Sequence](#network-sequence) (old) + +#### Dependency Graph +The dependency graph in Protopipe is specified by: +- `op_desc` - The list of operations, every operation has the following parameters: + - `tag` - **Required**. The unique name of operation. + - `type` - **Optional**. The operation type: _Infer_, _CPU_, _Compound_ (**Default**: _Infer_) + - `repeat_count` - **Optional**. Runs operation over specified number of iterations. +- `connections` - The list of connections between operations. + +Supported operation types +1. `Infer` - Performs model inference. Follow [Model parameters](#model-parameters) for the details. +2. `CPU` - Simulates CPU load by performing the busy wait during `time_in_us` amount of time in microseconds +3. 
`Compound` - Defines a subgraph that consists of `Infer` and `CPU` node types + +``` +op_desc: + - { tag: A, path: Model-A.xml, ip: FP16, op: FP16 } + - { tag: B, path: Model-B.onnx, framework: onnxrt, ep: { name: OV, device_type: CPU_FP32 } } + - { tag: C, type: CPU, time_in_us: 5000 } + - { tag: D, path: Model-D.onnx, framework: onnxrt } + - { tag: E, path: Model-E.xml, il: NCHW, device: NPU, config: { PERFORMANCE_HINT: LATENCY } } + - { tag: F, path: Model-F.xml } +connections: + - [A, C, E, F] + - [A, B, D, F] + - [B, F] +``` +```mermaid + graph LR; + A-->B + A-->C + B-->D + B-->F + C-->E + E-->F + D-->F +``` + +The source **is not** reflected in the graph structure; assume that all operations that don't have input connections are implicitly linked with the source, e.g. for the graph above: +```mermaid + graph LR; + Source-->A + A-->B + A-->C + B-->D + B-->F + C-->E + E-->F + D-->F +``` +**Note:** The situation when all nodes don't have input connections is also possible, consider: +``` +op_desc: + - { tag: A, path: Model-A.xml } + - { tag: B, path: Model-B.xml } + - { tag: C, path: Model-C.xml } +``` + +```mermaid + graph LR; + Source-->A + Source-->B + Source-->C +``` +In this case the section `connections` **can be omitted**. + +**Note:** The graph must remain a `DAG`, so any loops in the graph are prohibited, including self-loops as well as double edges. These are examples of incorrect graphs: +``` +#1: Invalid - The list must contain at least two operations to connect +- [A] +#2: Invalid - Self-loop is prohibited +- [A, A] +#3: Invalid - Loop is prohibited +- [A, B, C, A] +#4: Invalid - Double edge [B->C] is prohibited +- [A, B, C] +- [B, C] +``` +**Example of repeat_count usage** +``` +- op_desc: + - { tag: A, path: Model_A.xml, ... } + - { tag: B, path: Model_B.xml, repeat_count: 20 } + - { tag: C, path: Model_C.xml, ... } + connections: + - [A, B, C] +``` +This defines the following pipeline: +```mermaid +graph LR; + A-->B + B-->C + B--->|20 iterations|B + +``` +**Example of "Compound" type operation**. +``` +op_desc: + - { tag: A, path: Model-A.xml } + - tag: B, + type: Compound, + repeat_count: 10, + op_desc: + - { tag: D, path: Model-D.xml } + - { tag: E, path: Model-E.xml } + - { tag: F, path: Model-F.xml } + connections: + - [D, E] + - [D, F] + - { tag: C, path: Model-C.xml } +connections: + - [A, B, C] +``` +This defines the following pipeline: +```mermaid +graph LR; + A[Model-A.xml] + C[Model-C.xml] + + subgraph B[Repeats 10 iterations] + direction LR + D[Model-D.xml] + E[Model-E.xml] + F[Model-F.xml] + + D --> E + D --> F + + end + + A --> B + B --> C +``` + +#### Network Sequence +There is also a way to describe the graph by using a chain-like structure: +`network` - **Required**. List or list of lists of model parameters. Follow [Model Parameters](#model-parameters) for the details. +`delay_in_us` - **Optional**. Delay between models in microseconds. + +``` +input_stream_list: +- network: + - { name: A.xml, ip: FP16, il: NCHW, device: CPU } + - [{ name: B.xml, ip: FP16, op: FP16 }, { name: C.xml, ip: FP16, op: FP16 }] + - { name: D.xml, ip: FP16, op: FP16, config: { PERFORMANCE_HINT: LATENCY } } + delay_in_us: 5000 +``` + +```mermaid + graph LR; + A-->Delay1; + Delay1-->B; + Delay1-->C; + B-->Delay2; + C-->Delay2; + Delay2-->D +``` + +### Scenario parameters +The list of scenarios is specified by using the `multi_inference` parameter, every scenario has the following parameters: +- `name` - **Optional**. The name of execution scenario. +- `input_stream_list` - **Required**.
The list of streams that will be run in parallel. + +Every stream has the following execution parameters: +- `name` - **Optional**. The name of the stream. +- `iteration_count` - **Optional**. Number of iterations to execute. +- `exec_time_in_secs` - **Optional**. Execute until timeout specified. +- `frames_interval_in_ms` - **Optional**. Execution frequency of the stream (**Default**: 0 - Unbounded) +- `target_fps` - **Optional**. Execution frequency of the stream. `target_fps = 1000 / frames_interval_in_ms`. `target_fps` and `frames_interval_in_ms` are mutually exclusive and cannot be provided together. +- `target_latency_in_ms` - **Optional**. When iteration isn't finished within specified interval, the next frame will be dropped from execution. (**Default**: Disabled) +- `op_desc`/`connections` or `network` - **Required**. Execution graph structure. Follow [Graph structure](#graph-structure) for the details. + +### Config example +Consider the following scenario that consists of two parallel streams specified in `config.yaml`: +``` +model_dir: + local: C:\workspace\models +device_name: NPU +compiler_type: MLIR +log_level: INFO + +multi_inference: +- input_stream_list: + - network: + - { name: A.xml, ip: FP16, il: NCHW, device: CPU } + - [{ name: B.xml, ip: FP16, op: FP16 }, { name: C.xml, ip: FP16, op: FP16 }] + - { name: D.xml, ip: FP16, op: FP16, config: { PERFORMANCE_HINT: LATENCY } } + target_fps: 30 + exec_time_in_secs: 15 + - op_desc: + - { tag: E, path: E.onnx, framework: onnxrt, ep: { name: OV, device_type: NPU_U8 } } + - { tag: F, type: CPU, time_in_us: 5000 } + - { tag: G, path: G.xml, ip: FP16, op: FP16, priority: HIGH } + connections: + - [E, F, G] + target_fps: 100 + exec_time_in_secs: 15 +``` +- The first `stream` is defined by using [Network sequence](#network-sequence) syntax and will execute the following graph with `30` FPS cadence: + ```mermaid + graph LR; + A-->B; + A-->C; + B-->D; + C-->D; + ``` +- The second `stream` is defined by using [Dependency graph](#dependency-graph) syntax and will execute the following graph with `100` FPS cadence. + ```mermaid + graph LR; + E-->F; + F-->G; + ``` + +Run: +``` +./protopipe -cfg config.yaml --drop_frames +``` +Both streams will be executed simultaneously in different threads for `15` seconds. + +Output format: +``` +stream 0: throughput: FPS, latency: min: ms, avg: ms, max: ms, frames dropped: / +stream 1: throughput: FPS, latency: min: ms, avg: ms, max: ms, frames dropped: / +``` + +## How to run +Protopipe has the following `CLI` options to configure the execution behaviour: + +`--cfg ` - Path to configuration file. +`--drop_frames` - **Optional**. Drop frames if they come earlier than stream is completed. E.g. if the `stream` works with `target_fps: 10` (~`100ms` latency) but the stream iteration takes `150ms`, the next iteration will be triggered only in `50ms` if the option is enabled. +`--pipeline` - **Optional**. Enables pipelined execution for all scenarios/streams. +`--niter ` - **Optional**. Number of iterations. If specified overwrites termination criterion specified in configuration file for all scenarios/streams. +`-t ` - **Optional**. Time in seconds. If specified overwrites termination criterion specified in configuration file for all scenarios/streams. +`--mode ` - **Optional**. Execution mode: *performance*, *reference*, *validation* (**Default**: *performance*) +`--exec_filter ` - **Optional**. Run only the scenarios that match provided string pattern. +`--inference_only` - **Optional**.
Run only inference execution for every model excluding i/o data transfer (**Default**: true) + +### Filtering +Sometimes it's needed to run a particular set of scenarios specified in the config file rather than all of them. +For example, consider the following config file with three scenarios specified in `scenarios.yaml`: +``` +model_dir: + local: /models/ +device_name: CPU +multi_inference: +- input_stream_list: + - network: + - { name: A.xml } +- input_stream_list: + - network: + - { name: B.xml } +- input_stream_list: + - network: + - { name: C.xml } +``` +By default, all scenarios are assigned unique names according to the following `multi_inference_` pattern. +E.g. the scenario with model `A.xml` has the default name `multi_inference_0`. +Use the `-exec_filter ` CLI option to control which scenarios from the config should be executed: +``` +./protopipe -cfg scenarios.yaml -niter 100 -exec_filter=".*[0-1]" +``` +Only `multi_inference_0` and `multi_inference_1` scenarios will be executed. + +It's also possible to overwrite the default names in the config file: +``` +model_dir: + local: /models/ +device_name: CPU +multi_inference: +- name: Model-A-Scenario + input_stream_list: + - network: + - { name: A.xml } +- name: Model-B-Scenario + input_stream_list: + - network: + - { name: B.xml } +- name: Model-C-Scenario + input_stream_list: + - network: + - { name: C.xml } +``` +and use them for filtering: +``` +./protopipe --cfg scenarios.yaml --niter 100 --exec_filter ".*-[AB].*" +``` +Only `Model-A-Scenario` and `Model-B-Scenario` scenarios will be executed. + +**Note**: Protopipe uses [std::regex](https://en.cppreference.com/w/cpp/regex) rules for pattern matching. + +## Use cases +Once the scenario configuration is defined (see: [How to configure](#how-to-configure)), it can be used for various use cases. +### Measure performance +`Protopipe` can report performance statistics; consider the following run example: +``` +./protopipe --cfg config.yaml --drop_frames -t 30 +``` +Example of output: +``` +stream 0: throughput: 7.62659 FPS, latency: min: 93.804 ms, avg: 111.31 ms, max: 145.178 ms, frames dropped: 290/390 +``` +It might also be interesting to play with the following `CLI` options: +- `--drop_frames=false` - Disables frame drop. By default, if an iteration doesn't fit into the 1000 / `target_fps` latency interval, the next iteration will be skipped. +- `--inference_only=false` - Enables i/o data transfer for inference. By default only inference time is captured in performance statistics. +- `--pipeline` - Enables ***pipelined*** execution. + +### Generate reference +As a prerequisite for accuracy validation, it's useful to have a mechanism that generates the reference output data to compare with. In Protopipe it can be done by using the `reference` mode. +Use additional parameters to configure `reference` mode: +- `input_data` - **Required**. Path that contains input data for the model; if the entity under the path is empty, input data will be generated randomly and dumped into the path specified. +- `output_data` - **Required**. Path where to dump reference output data. +- `random` - **Optional**. Initializer to generate input data randomly.
(Default: ` { dist: uniform, low: 0.0, high: 255 }`) + +Examples: +``` +random: { dist: uniform, low: -1.0, high: 1.0 } # specified globally for all models +multi_inference: +- input_stream_list: + - network: + - { name: A.xml, ip: FP16, input_data: A-inputs/, output_data: B-inputs/ } + # overwrites global initializer for the model B.xml + - { name: B.xml, ip: FP16, input_data: B-inputs/, output_data: B-outputs/, random: { name: uniform, low: 0, high: 255.0 } } +``` + +Run `Protopipe` in `reference` mode: +``` +./protopipe -cfg config.yaml -mode reference -niter 10 +``` +Output: +``` +stream 0: Reference data has been generated for 10 iteration(s) +``` + +### Validate accuracy +Protopipe has a dedicated `validation` mode to perform accuracy validation. An existing configuration file can simply be extended to perform accuracy validation: + +- `save_validation_outputs` - **Optional**. Accepts the path where to dump actual execution outputs. (Default: disabled) +- `metric` - **Optional**. Accuracy metric to compare actual vs reference outputs. (Default: `{ name: norm, tolerance: 0.0 }`) +- `input_data` - **Required**. Path that contains input data for the model. +- `output_data` - **Required**. Path that contains **reference** data to compare with. + +**Note**: If a folder is provided either for **input_data** or **output_data**, it must be in the following format: +``` +input_data/ + / + input_0.bin + input_1.bin + ... + input_N.bin + +output_data/ + / + output_0.bin + output_1.bin + ... + output_N.bin +``` +**Note**: Input and output data can be generated automatically by using `Protopipe` in **reference** mode. (see: [Generate reference](#generate-reference)) + +Examples: +``` +- { name: model.xml, ip: FP16, input_data: input_data/, output_data: output_data/ } +- { name: model.xml, ip: FP16, input_data: input.bin, output_data: output.bin } +- { name: model.xml, ip: FP16, input_data: { data: input.bin }, output_data: { result: output.bin } } +``` + +### Supported metrics +1. L2 Norm: $$\text{Norm}(\mathbf{A}, \mathbf{B}) = \sqrt{\sum_{i,j} (A_{i,j} - B_{i,j})^2}$$ +Parameters: + - `name: norm` - **Required**. Enables L2 Norm metric. + - `tolerance` - **Required**. If value of metric is greater than **tolerance** it will be treated as **FAIL**. +2. Cosine similarity: $$\text{Cosine}(\mathbf{A}, \mathbf{B}) = \frac{\mathbf{A} \cdot \mathbf{B}}{\| \mathbf{A} \|_2 \| \mathbf{B} \|_2}$$ +Parameters: + - `name: cosine` - **Required**. Enables cosine similarity metric. + - `threshold` - **Required**. If value of metric is lower than **threshold** it will be treated as **FAIL**. +3. NRMSE: $$\text{NRMSE}(\mathbf{A}, \mathbf{B}) = \frac{1}{D}\sqrt{\frac{1}{N}\sum_{i=1}^N(A_i - B_i)^2}$$ +Where, +$$D = \text{max}(0.001, \text{max}(A_{max}-A_{min}\text{, } B_{max}-B_{min}))$$ +Parameters: + - `name: nrmse` - **Required**. Enables nrmse metric. + - `tolerance` - **Required**. If value of metric is greater than **tolerance** it will be treated as **FAIL**.
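For a quick sanity check of the three formulas above outside of Protopipe, the following small NumPy sketch computes the same values on a pair of arrays. This is only an illustration of the math, not Protopipe's own C++ implementation; the array shapes and the random test data below are made up, and in practice the compared tensors would come from the dumped `output_*.bin` files.

```python
# Standalone sketch of the accuracy metrics documented above (illustrative only).
import numpy as np

def l2_norm(a, b):
    # Norm(A, B) = sqrt(sum((A - B)^2)); FAIL if greater than `tolerance`
    return float(np.sqrt(np.sum((a - b) ** 2)))

def cosine(a, b):
    # Cosine(A, B) = (A . B) / (||A||_2 * ||B||_2); FAIL if lower than `threshold`
    return float(np.dot(a.ravel(), b.ravel()) / (np.linalg.norm(a) * np.linalg.norm(b)))

def nrmse(a, b):
    # NRMSE(A, B) = sqrt(mean((A - B)^2)) / D, D = max(0.001, max(range(A), range(B)));
    # FAIL if greater than `tolerance`
    d = max(0.001, float(max(a.max() - a.min(), b.max() - b.min())))
    return float(np.sqrt(np.mean((a - b) ** 2)) / d)

# Hypothetical data; real checks would load the actual and reference output dumps instead.
rng = np.random.default_rng(0)
actual = rng.uniform(0.0, 255.0, size=(1, 1000)).astype(np.float32)
reference = actual + rng.normal(0.0, 0.5, size=actual.shape).astype(np.float32)

print("norm:", l2_norm(actual, reference))
print("cosine:", cosine(actual, reference))
print("nrmse:", nrmse(actual, reference))
```

Comparing the returned values against the `tolerance`/`threshold` fields of the corresponding `metric` entry mirrors the per-layer PASS/FAIL decision described above.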
+ +### Example +Consider the following `config.yaml`: +``` +model_dir: + local: C:\workspace\models +device_name: NPU +compiler_type: MLIR +log_level: INFO + +save_validation_outputs: actual-outputs/ +metric: { name: norm, tolerance: 0.01 } + +multi_inference: +- input_stream_list: + - network: + - { name: A.xml, ip: FP16, input_data: A-inputs/, output_data: A-outputs/ } + # overwrites the global metric for the model B.xml + - { name: B.xml, ip: FP16, input_data: B-inputs/, output_data: B-outputs/, metric: { name: norm, tolerance: 0.0 } } +``` + +Use `reference` mode to generate random input data for every model and calculate the reference outputs. +**Note**: If the reference device is different, it can be changed in the config file (`device_name`) accordingly. +``` +./protopipe --cfg config.yaml --mode reference -niter 10 +``` +Use `validation` mode to perform accuracy validation: +``` +./protopipe --cfg config.yaml --mode validation -t 15 +``` +Example of successful validation: +``` +stream 0: Validation has passed for iteration(s) +``` +In case of accuracy issues the output will be the following: +``` +stream 0: Accuracy check failed on iteration(s) (first 10): +Iteration : + Model: A, Layer: , Metric: Norm{tolerance: 0.01}, Reason: > 0.01; +``` + +## How to build +### Prerequisites +1. Clone `npu-plugin` repository +2. Build OpenCV G-API with OpenVINO/ONNXRT support +#### Build OpenCV G-API with OpenVINO/ONNXRT support +1. Clone OpenCV repo: + ``` + git clone https://github.com/opencv/opencv + cd opencv && git checkout 78195bc3df + ``` +2. Build OpenCV G-API: + ``` + mkdir -p build && cd build + cmake ../ -DBUILD_LIST=gapi \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_OPENVINO=ON \ + -DOpenVINO_DIR= \ + -DWITH_ONNX=ON \ + -DORT_INSTALL_DIR= + cmake --build . --config Release --target opencv_gapi --parallel + ``` +### In-plugin build + +1. Clone and build [OpenVINO](https://github.com/openvinotoolkit/openvino) from sources +2. Build OpenCV G-API with OpenVINO / ONNXRT support +3. Clone `npu-plugin` repository + ``` + git clone https://github.com/openvinotoolkit/npu_plugin + git submodule update --init --recursive + ``` +4. Build `Protopipe` as part of the `npu-plugin` build: + ``` + mkdir build && cd build + cmake ../ -DOpenCV_DIR= -DOpenVINODeveloperPackage_DIR= + cmake --build . --config Release --target protopipe --parallel + ``` + +### Standalone build +1. Build `yaml-cpp` + ``` + mkdir -p yaml-cpp_build && cd yaml-cpp_build + cmake ..//thirdparty/yaml-cpp -DCMAKE_INSTALL_PREFIX=install + cmake --build . --config Release --target install --parallel + ``` +2. Build `gflags` + ``` + git clone https://github.com/gflags/gflags + cd gflags + mkdir -p gflags_build && cd gflags_build + cmake ../ -DCMAKE_INSTALL_PREFIX=install + cmake --build . --config Release --target install --parallel + ``` +3. Build `Protopipe` + ``` + mkdir -p protopipe_build && cd protopipe_build + cmake /tools/protopipe/ \ + -DOpenCV_DIR= \ + -Dgflags_DIR= \ + -DOpenVINO_DIR= \ + + cmake --build . --config Release --target protopipe --parallel + ``` +### Verify the installation +**Note**: Make sure `opencv_*` libraries are visible in the environment: +- Windows: + ``` + set PATH=\build\bin\Release\;%PATH% + ``` +- Linux: + ``` + export LD_LIBRARY_PATH=/build/lib/:$LD_LIBRARY_PATH + ``` +**Note**: If `OpenCV` has been built with `ONNXRT` support, all `ONNXRT` related libraries must be located in the same folder as the `protopipe` executable.
+ +Run `Protopipe` with -h flag to verify installation: +``` +> protopipe.exe -h +``` +Successful build will show the information about `Protopipe` CLI options: +``` +protopipe [OPTIONS] + + Common options: + -h Optional. Print the usage message. + -cfg Path to the configuration file. + -pipeline Optional. Enable pipelined execution. + -drop_frames Optional. Drop frames if they come earlier than pipeline is completed. + -mode Optional. Simulation mode: performance (default), reference, validation. + -niter Optional. Number of iterations. If specified overwrites termination criterion for all scenarios in configuration file. + -t Optional. Time in seconds. If specified overwrites termination criterion for all scenarios in configuration file. + -inference_only Optional. Run only inference execution for every model excluding i/o data transfer. Applicable only for "performance" mode. (default: true). + -exec_filter Optional. Run the scenarios that match provided string pattern. +``` diff --git a/src/plugins/intel_npu/tools/protopipe/cmake/standalone.cmake b/src/plugins/intel_npu/tools/protopipe/cmake/standalone.cmake new file mode 100644 index 00000000000000..090756f86c44c0 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/cmake/standalone.cmake @@ -0,0 +1,63 @@ +# +# Copyright (C) 2024 Intel Corporation. +# SPDX-License-Identifier: Apache 2.0 +# + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +if("${CMAKE_BUILD_TYPE}" STREQUAL "") + set(CMAKE_BUILD_TYPE "Release") +endif() + +find_package(OpenVINO REQUIRED COMPONENTS Runtime) +find_package(Threads REQUIRED) +find_package(OpenCV 4.9.0 REQUIRED COMPONENTS gapi) + +find_package(yaml-cpp QUIET) +find_package(gflags QUIET) + +if (NOT yaml-cpp_FOUND) + set(YAML_CPP_SOURCES_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/yaml-cpp") + message(STATUS "yaml-cpp package was not found. Trying to find source package in ${YAML_CPP_SOURCES_PATH}.") + if(EXISTS ${YAML_CPP_SOURCES_PATH}) + message(STATUS "yaml-cpp source package found. yaml-cpp will be built from sources.") + add_subdirectory(${YAML_CPP_SOURCES_PATH} yaml-cpp EXCLUDE_FROM_ALL) + else() + message(FATAL_ERROR "yaml-cpp package and sources were not found. CMake will exit." ) + endif() +endif() + +if (NOT gflags_FOUND) + set(GFLAGS_SOURCES_PATH "${PACKAGE_PREFIX_DIR}/samples/cpp/thirdparty/gflags") + message(STATUS "gflags package was not found. Trying to find source package in ${GFLAGS_SOURCES_PATH}.") + if(EXISTS ${GFLAGS_SOURCES_PATH}) + message(STATUS "gflags source package found. gflags will be built from sources.") + add_subdirectory(${GFLAGS_SOURCES_PATH} gflags EXCLUDE_FROM_ALL) + else() + message(FATAL_ERROR "gflags was not found. CMake will exit." 
) + endif() +endif() + +set(DEPENDENCIES + Threads::Threads + gflags + yaml-cpp + openvino::runtime + opencv_gapi +) + +if (WIN32) + list(APPEND DEPENDENCIES "winmm.lib") +endif() + +file(GLOB_RECURSE SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp") +list(APPEND SOURCES main.cpp) + +add_executable(${TARGET_NAME} ${SOURCES}) +target_link_libraries(${TARGET_NAME} PRIVATE ${DEPENDENCIES}) +target_include_directories(${TARGET_NAME} PUBLIC "${PROJECT_SOURCE_DIR}/src/") + +install(TARGETS ${TARGET_NAME} + DESTINATION "tools/${TARGET_NAME}" + COMPONENT npu_tools) diff --git a/src/plugins/intel_npu/tools/protopipe/main.cpp b/src/plugins/intel_npu/tools/protopipe/main.cpp new file mode 100644 index 00000000000000..8596ba864335ca --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/main.cpp @@ -0,0 +1,266 @@ +// +// Copyright (C) 2023-2024 Intel Corporation. +// SPDX-License-Identifier: Apache 2.0 +// + +#include +#include +#include + +#include + +#include "parser/parser.hpp" +#include "scenario/scenario_graph.hpp" +#include "simulation/performance_mode.hpp" +#include "simulation/reference_mode.hpp" +#include "simulation/validation_mode.hpp" + +#include "utils/error.hpp" +#include "utils/logger.hpp" + +static constexpr char help_message[] = "Optional. Print the usage message."; +static constexpr char cfg_message[] = "Path to the configuration file."; +static constexpr char device_message[] = + "Optional. Device name. If specified overwrites device specified in config file."; +static constexpr char pipeline_message[] = "Optional. Enable pipelined execution."; +static constexpr char drop_message[] = "Optional. Drop frames if they come earlier than pipeline is completed."; +static constexpr char mode_message[] = "Optional. Simulation mode: performance (default), reference, validation."; +static constexpr char niter_message[] = "Optional. Number of iterations. If specified overwrites termination criterion" + " for all scenarios in configuration file."; +static constexpr char exec_time_message[] = "Optional. Time in seconds. If specified overwrites termination criterion" + " for all scenarios in configuration file."; +static constexpr char inference_only_message[] = + "Optional. Run only inference execution for every model excluding i/o data transfer." + " Applicable only for \"performance\" mode. (default: true)."; + +static constexpr char exec_filter_msg[] = "Optional. 
Run the scenarios that match provided string pattern."; + +DEFINE_bool(h, false, help_message); +DEFINE_string(cfg, "", cfg_message); +DEFINE_string(d, "", device_message); +DEFINE_bool(pipeline, false, pipeline_message); +DEFINE_bool(drop_frames, false, drop_message); +DEFINE_string(mode, "performance", mode_message); +DEFINE_uint64(niter, 0, niter_message); +DEFINE_uint64(t, 0, exec_time_message); +DEFINE_bool(inference_only, true, inference_only_message); +DEFINE_string(exec_filter, ".*", exec_filter_msg); + +static void showUsage() { + std::cout << "protopipe [OPTIONS]" << std::endl; + std::cout << std::endl; + std::cout << " Common options: " << std::endl; + std::cout << " -h " << help_message << std::endl; + std::cout << " -cfg " << cfg_message << std::endl; + std::cout << " -pipeline " << pipeline_message << std::endl; + std::cout << " -drop_frames " << drop_message << std::endl; + std::cout << " -d " << device_message << std::endl; + std::cout << " -mode " << mode_message << std::endl; + std::cout << " -niter " << niter_message << std::endl; + std::cout << " -t " << exec_time_message << std::endl; + std::cout << " -inference_only " << inference_only_message << std::endl; + std::cout << " -exec_filter " << exec_filter_msg << std::endl; + std::cout << std::endl; +} + +bool parseCommandLine(int* argc, char*** argv) { + gflags::ParseCommandLineNonHelpFlags(argc, argv, true); + + if (FLAGS_h) { + showUsage(); + return false; + } + + if (FLAGS_cfg.empty()) { + throw std::invalid_argument("Path to config file is required"); + } + + std::cout << "Parameters:" << std::endl; + std::cout << " Config file: " << FLAGS_cfg << std::endl; + std::cout << " Pipelining is enabled: " << std::boolalpha << FLAGS_pipeline << std::endl; + std::cout << " Simulation mode: " << FLAGS_mode << std::endl; + std::cout << " Inference only: " << std::boolalpha << FLAGS_inference_only << std::endl; + std::cout << " Device: " << FLAGS_d << std::endl; + return true; +} + +static ICompiled::Ptr compileSimulation(Simulation::Ptr simulation, const bool pipelined, const bool drop_frames) { + LOG_INFO() << "Compile simulation" << std::endl; + if (pipelined) { + return simulation->compilePipelined(drop_frames); + } + return simulation->compileSync(drop_frames); +}; + +class ThreadRunner { +public: + using F = std::function; + void add(F&& func) { + m_funcs.push_back(std::move(func)); + } + void run(); + +private: + std::vector m_funcs; +}; + +void ThreadRunner::run() { + std::vector> futures; + futures.reserve(m_funcs.size()); + for (auto&& func : m_funcs) { + futures.push_back(std::async(std::launch::async, std::move(func))); + } + for (auto& future : futures) { + future.get(); + }; +}; + +class Task { +public: + Task(ICompiled::Ptr&& compiled, std::string&& name, ITermCriterion::Ptr&& criterion); + + void operator()(); + const Result& result() const; + const std::string& name() const; + +private: + ICompiled::Ptr m_compiled; + std::string m_name; + ITermCriterion::Ptr m_criterion; + + Result m_result; +}; + +Task::Task(ICompiled::Ptr&& compiled, std::string&& name, ITermCriterion::Ptr&& criterion) + : m_compiled(std::move(compiled)), m_name(std::move(name)), m_criterion(std::move(criterion)) { +} + +void Task::operator()() { + try { + m_result = m_compiled->run(m_criterion); + } catch (const std::exception& e) { + m_result = Error{e.what()}; + } +} + +const Result& Task::result() const { + return m_result; +} + +const std::string& Task::name() const { + return m_name; +} + +static Simulation::Ptr createSimulation(const 
std::string& mode, StreamDesc&& stream, const bool inference_only, + const Config& config) { + Simulation::Ptr simulation; + // NB: Common parameters for all simulations + Simulation::Config cfg{stream.name, stream.frames_interval_in_us, config.disable_high_resolution_timer, + std::move(stream.graph), std::move(stream.infer_params_map)}; + if (mode == "performance") { + PerformanceSimulation::Options opts{config.initializer, std::move(stream.initializers_map), + std::move(stream.input_data_map), inference_only, + std::move(stream.target_latency)}; + simulation = std::make_shared(std::move(cfg), std::move(opts)); + } else if (mode == "reference") { + CalcRefSimulation::Options opts{config.initializer, std::move(stream.initializers_map), + std::move(stream.input_data_map), std::move(stream.output_data_map)}; + simulation = std::make_shared(std::move(cfg), std::move(opts)); + } else if (mode == "validation") { + ValSimulation::Options opts{config.metric, std::move(stream.metrics_map), std::move(stream.input_data_map), + std::move(stream.output_data_map), std::move(stream.per_iter_outputs_path)}; + simulation = std::make_shared(std::move(cfg), std::move(opts)); + } else { + throw std::logic_error("Unsupported simulation mode: " + mode); + } + ASSERT(simulation); + return simulation; +} + +int main(int argc, char* argv[]) { + // NB: Intentionally wrapped into try-catch to display exceptions occur on windows. + try { + if (!parseCommandLine(&argc, &argv)) { + return 0; + } + ReplaceBy replace_by{FLAGS_d}; + + auto parser = std::make_shared(FLAGS_cfg); + + LOG_INFO() << "Parse scenarios from " << FLAGS_cfg << " config file" << std::endl; + auto config = parser->parseScenarios(replace_by); + LOG_INFO() << "Found " << config.scenarios.size() << " scenario(s)" << std::endl; + + // NB: Overwrite termination criteria for all scenarios if specified via CLI + ITermCriterion::Ptr global_criterion; + if (FLAGS_niter != 0u) { + LOG_INFO() << "Termination criterion of " << FLAGS_niter << " iteration(s) will be used for all scenarios" + << std::endl; + global_criterion = std::make_shared(FLAGS_niter); + } + if (FLAGS_t != 0u) { + if (global_criterion) { + // TODO: In fact, it make sense to have them both enabled. 
+ THROW_ERROR("-niter and -t options can't be specified together!"); + } + LOG_INFO() << "Termination criterion of " << FLAGS_t << " second(s) will be used for all scenarios" + << std::endl; + // NB: TimeOut accepts microseconds + global_criterion = std::make_shared(FLAGS_t * 1'000'000); + } + + std::regex filter_regex{FLAGS_exec_filter}; + bool any_scenario_failed = false; + for (auto&& scenario : config.scenarios) { + // NB: Skip the scenarios that don't match provided filter pattern + if (!std::regex_match(scenario.name, filter_regex)) { + LOG_INFO() << "Skip the scenario " << scenario.name << " as it doesn't match the -exec_filter=\"" + << FLAGS_exec_filter << "\" pattern" << std::endl; + continue; + } + LOG_INFO() << "Start processing " << scenario.name << std::endl; + + ThreadRunner runner; + std::vector tasks; + tasks.reserve(scenario.streams.size()); + for (auto&& stream : scenario.streams) { + auto criterion = stream.criterion; + auto stream_name = stream.name; + if (global_criterion) { + if (criterion) { + LOG_INFO() << "Stream: " << stream_name + << " termination criterion is overwritten by CLI parameter" << std::endl; + } + criterion = global_criterion->clone(); + } + auto simulation = createSimulation(FLAGS_mode, std::move(stream), FLAGS_inference_only, config); + auto compiled = compileSimulation(simulation, FLAGS_pipeline, FLAGS_drop_frames); + tasks.emplace_back(std::move(compiled), std::move(stream_name), std::move(criterion)); + runner.add(std::ref(tasks.back())); + } + + LOG_INFO() << "Run " << tasks.size() << " stream(s) asynchronously" << std::endl; + runner.run(); + LOG_INFO() << "Execution has finished" << std::endl; + + for (const auto& task : tasks) { + if (!task.result()) { + // NB: Scenario failed if any of the streams failed + any_scenario_failed = true; + } + std::cout << "stream " << task.name() << ": " << task.result().str() << std::endl; + } + std::cout << "\n"; + } + if (any_scenario_failed) { + return EXIT_FAILURE; + } + } catch (const std::exception& e) { + std::cout << e.what() << std::endl; + throw; + } catch (...) 
{ + std::cout << "Unknown error" << std::endl; + throw; + } + return 0; +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/graph.cpp b/src/plugins/intel_npu/tools/protopipe/src/graph.cpp new file mode 100644 index 00000000000000..d13d2954a21b12 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/graph.cpp @@ -0,0 +1,140 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include "graph.hpp" + +Nodes Node::srcNodes() const { + Nodes src_nodes; + src_nodes.reserve(m_src_edges.size()); + std::transform(m_src_edges.begin(), m_src_edges.end(), std::back_inserter(src_nodes), [](EdgeHandle edge) { + return edge->srcNode(); + }); + return src_nodes; +} + +Nodes Node::dstNodes() const { + Nodes dst_nodes; + dst_nodes.reserve(m_dst_edges.size()); + std::transform(m_dst_edges.begin(), m_dst_edges.end(), std::back_inserter(dst_nodes), [](EdgeHandle edge) { + return edge->dstNode(); + }); + return dst_nodes; +} + +Edges Node::srcEdges() const { + return {m_src_edges.begin(), m_src_edges.end()}; +} + +Edges Node::dstEdges() const { + return {m_dst_edges.begin(), m_dst_edges.end()}; +} + +NodeHandle Graph::create() { + auto node = std::make_shared(); + NodeHandle nh(node); + m_nodes.emplace(node.get(), MetaPtr{node, Meta{}}); + return nh; +} + +void Graph::remove(NodeHandle nh) { + auto src_edges = nh->srcEdges(); + for (size_t i = 0; i < src_edges.size(); ++i) { + remove(src_edges[i]); + } + auto dst_edges = nh->dstEdges(); + for (size_t i = 0; i < dst_edges.size(); ++i) { + remove(dst_edges[i]); + } + m_nodes.erase(nh.get()); +} + +void Graph::remove(EdgeHandle eh) { + auto src = eh->srcNode(); + auto dst = eh->dstNode(); + src->m_dst_edges.erase(eh); + dst->m_src_edges.erase(eh); + m_edges.erase(eh.get()); +}; + +EdgeHandle Graph::link(NodeHandle src, NodeHandle dst) { + auto edge = std::make_shared(src, dst); + EdgeHandle eh{edge}; + m_edges.emplace(edge.get(), MetaPtr{edge, Meta{}}); + src->m_dst_edges.insert(eh); + dst->m_src_edges.insert(eh); + return eh; +} + +Meta& Graph::meta(NodeHandle handle) { + const auto it = m_nodes.find(handle.get()); + ASSERT(it != m_nodes.end()); + return it->second.meta; +} + +const Meta& Graph::meta(NodeHandle handle) const { + const auto it = m_nodes.find(handle.get()); + ASSERT(it != m_nodes.end()); + return it->second.meta; +} + +Meta& Graph::meta(EdgeHandle handle) { + const auto it = m_edges.find(handle.get()); + ASSERT(it != m_edges.end()); + return it->second.meta; +} + +const Meta& Graph::meta(EdgeHandle handle) const { + const auto it = m_edges.find(handle.get()); + ASSERT(it != m_edges.end()); + return it->second.meta; +} + +std::vector Graph::nodes() const { + std::vector ret; + std::transform(m_nodes.begin(), m_nodes.end(), std::back_inserter(ret), [](const auto& p) { + return NodeHandle{p.second.ptr}; + }); + return ret; +} + +static void dfs(NodeHandle& nh, std::unordered_set& visited, std::stack& stack) { + visited.insert(nh); + auto dst_nodes = nh->dstNodes(); + for (auto dst_nh : dst_nodes) { + auto it = visited.find(dst_nh); + if (it == visited.end()) { + dfs(dst_nh, visited, stack); + } + } + stack.push(nh); +}; + +std::vector Graph::sorted() const { + std::unordered_set visited; + std::stack stack; + const auto nodes = this->nodes(); + for (auto nh : nodes) { + auto it = visited.find(nh); + if (it == visited.end()) { + dfs(nh, visited, stack); + } + } + std::vector sorted; + while (!stack.empty()) { + sorted.push_back(stack.top()); + stack.pop(); + } + return 
sorted; +} + +Meta& Meta::operator+=(const Meta& other) { + for (const auto& p : other.store) { + ASSERT(store.emplace(p.first, p.second).second); + } + return *this; +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/graph.hpp b/src/plugins/intel_npu/tools/protopipe/src/graph.hpp new file mode 100644 index 00000000000000..66aeccbe156d09 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/graph.hpp @@ -0,0 +1,168 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "utils/error.hpp" + +template +class WeakHandle { +public: + explicit WeakHandle(std::shared_ptr obj): m_obj(obj) { + } + T* get() const { + return m_obj.lock().get(); + } + T* operator->() const { + return get(); + } + bool operator==(const WeakHandle& other) const { + return get() == other.get(); + } + +private: + std::weak_ptr m_obj; +}; + +namespace std { +template +struct hash> { + uint64_t operator()(const WeakHandle& handle) const { + return std::hash()(handle.get()); + } +}; +} // namespace std + +class Graph; +class Node; +class Edge; + +using NodeHandle = WeakHandle; +using EdgeHandle = WeakHandle; +using Nodes = std::vector; +using Edges = std::vector; +using NodeSet = std::unordered_set; +using EdgeSet = std::unordered_set; + +class Node { + friend class Graph; + using Ptr = std::shared_ptr; + +public: + Nodes srcNodes() const; + Nodes dstNodes() const; + Edges srcEdges() const; + Edges dstEdges() const; + +private: + EdgeSet m_src_edges; + EdgeSet m_dst_edges; +}; + +class Edge { + friend class Graph; + using Ptr = std::shared_ptr; + +public: + Edge(NodeHandle src, NodeHandle dst): m_src(src), m_dst(dst) { + } + NodeHandle srcNode() const { + return m_src; + } + NodeHandle dstNode() const { + return m_dst; + } + +private: + NodeHandle m_src; + NodeHandle m_dst; +}; + +class Meta { +public: + template + void set(T&& meta); + template + const T& get() const; + template + T& get(); + template + bool has() const; + Meta& operator+=(const Meta& other); + +private: + using MetaStore = std::unordered_map; + MetaStore store; +}; + +template +void Meta::set(T&& meta) { + // NB: Check if there is no such meta yet. 
+ ASSERT(store.emplace(std::type_index(typeid(T)), std::forward(meta)).second); +} + +template +bool Meta::has() const { + auto it = store.find(std::type_index(typeid(T))); + return it != store.end(); +} + +template +const T& Meta::get() const { + const auto it = store.find(std::type_index(typeid(T))); + ASSERT(it != store.end()); + return *std::any_cast(&it->second); +} + +template +T& Meta::get() { + auto it = store.find(std::type_index(typeid(T))); + ASSERT(it != store.end()); + return *std::any_cast(&it->second); +} + +class Graph { +public: + NodeHandle create(); + void remove(NodeHandle nh); + void remove(EdgeHandle eh); + EdgeHandle link(NodeHandle src, NodeHandle dst); + + Meta& meta() { + return m_graph_meta; + } + const Meta& meta() const { + return m_graph_meta; + } + + Meta& meta(NodeHandle handle); + const Meta& meta(NodeHandle handle) const; + Meta& meta(EdgeHandle handle); + const Meta& meta(EdgeHandle handle) const; + + std::vector nodes() const; + std::vector sorted() const; + +private: + template + struct MetaPtr { + std::shared_ptr ptr; + Meta meta; + }; + template + using MetaMap = std::unordered_map>; + + Meta m_graph_meta; + MetaMap m_nodes; + MetaMap m_edges; +}; diff --git a/src/plugins/intel_npu/tools/protopipe/src/parser/config.cpp b/src/plugins/intel_npu/tools/protopipe/src/parser/config.cpp new file mode 100644 index 00000000000000..34099d36a69fdb --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/parser/config.cpp @@ -0,0 +1,872 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "parser/config.hpp" + +#include "utils/error.hpp" +#include "utils/logger.hpp" + +#include +#include +#include +#include + +#include // depth + +namespace fs = std::filesystem; + +struct GlobalOptions { + std::string blob_dir = "."; + std::string model_dir = "."; + std::string device_name = "NPU"; + std::string log_level = "NONE"; + std::string compiler_type = "DRIVER"; + std::optional save_validation_outputs; +}; + +struct Network { + std::string tag; + InferenceParams params; + LayerVariantAttr input_data; + LayerVariantAttr output_data; + LayerVariantAttr initializers; + LayerVariantAttr accuracy_metrics; +}; + +struct InferOp { + InferenceParams params; + LayerVariantAttr input_data; + LayerVariantAttr output_data; + LayerVariantAttr initializers; + LayerVariantAttr accuracy_metrics; +}; + +struct CPUOp { + uint64_t time_in_us; +}; + +struct CompoundOp { + uint64_t repeat_count; + InferenceParamsMap params; + ScenarioGraph subgraph; +}; + +struct OpDesc { + std::string tag; + using OpType = std::variant; + OpType op; +}; + +// NB: Handles duplicating tags. 
+class TagsManager { +public: + std::string add(const std::string& tag); + +private: + std::unordered_multiset m_tags; +}; + +std::string TagsManager::add(const std::string& tag) { + std::string t = tag; + m_tags.insert(t); + const auto c = m_tags.count(t); + if (c > 1) { + t += "-" + std::to_string(c); + } + return t; +} + +static LogLevel toLogLevel(const std::string& lvl) { + if (lvl == "NONE") + return LogLevel::None; + if (lvl == "INFO") + return LogLevel::Info; + if (lvl == "DEBUG") + return LogLevel::Debug; + THROW_ERROR("Unsupported log level: " << lvl); +} + +static int toDepth(const std::string& prec) { + if (prec == "FP32") + return CV_32F; + if (prec == "FP16") + return CV_16F; + if (prec == "U8") + return CV_8U; + if (prec == "I32") + return CV_32S; + throw std::logic_error("Unsupported precision type: " + prec); +} + +static AttrMap toDepth(const AttrMap& attrmap) { + AttrMap depthmap; + for (const auto& [name, str_depth] : attrmap) { + depthmap.emplace(name, toDepth(str_depth)); + } + return depthmap; +} + +static LayerVariantAttr toDepth(const LayerVariantAttr& attr) { + LayerVariantAttr depthattr; + if (std::holds_alternative(attr)) { + depthattr = toDepth(std::get(attr)); + } else { + depthattr = toDepth(std::get>(attr)); + } + return depthattr; +} + +static std::string toPriority(const std::string& priority) { + if (priority == "LOW") { + return "LOW"; + } + if (priority == "NORMAL") { + return "MEDIUM"; + } + if (priority == "HIGH") { + return "HIGH"; + } + throw std::logic_error("Unsupported model priority: " + priority); +} + +static ScenarioGraph buildGraph(const std::vector& op_descs, + const std::vector>& connections); + +namespace YAML { + +template +struct convert> { + static bool decode(const Node& node, std::vector& vec) { + if (!node.IsSequence()) { + return false; + } + + for (auto& child : node) { + vec.push_back(child.as()); + } + return true; + } +}; + +template +struct convert> { + static bool decode(const Node& node, std::map& map) { + if (!node.IsMap()) { + return false; + } + for (const auto& itr : node) { + map.emplace(itr.first.as(), itr.second.as()); + } + return true; + } +}; + +template +struct convert> { + static bool decode(const Node& node, LayerVariantAttr& layer_attr) { + if (node.IsMap()) { + layer_attr = node.as>(); + } else { + layer_attr = node.as(); + } + return true; + } +}; + +template <> +struct convert { + static bool decode(const Node& node, UniformGenerator::Ptr& generator) { + if (!node["low"]) { + THROW_ERROR("Uniform distribution must have \"low\" attribute"); + } + if (!node["high"]) { + THROW_ERROR("Uniform distribution must have \"high\" attribute"); + } + generator = std::make_shared(node["low"].as(), node["high"].as()); + return true; + } +}; + +template <> +struct convert { + static bool decode(const Node& node, IRandomGenerator::Ptr& generator) { + if (!node["dist"]) { + THROW_ERROR("\"random\" must have \"dist\" attribute!"); + } + const auto dist = node["dist"].as(); + if (dist == "uniform") { + generator = node.as(); + } else { + THROW_ERROR("Unsupported random distribution: \"" << dist << "\""); + } + return true; + } +}; + +template <> +struct convert { + static bool decode(const Node& node, Norm::Ptr& metric) { + // NB: If bigger than tolerance - fail. 
+ if (!node["tolerance"]) { + THROW_ERROR("Metric \"norm\" must have \"tolerance\" attribute!"); + } + const auto tolerance = node["tolerance"].as(); + metric = std::make_shared(tolerance); + return true; + } +}; + +template <> +struct convert { + static bool decode(const Node& node, Cosine::Ptr& metric) { + // NB: If lower than threshold - fail. + if (!node["threshold"]) { + THROW_ERROR("Metric \"cosine\" must have \"threshold\" attribute!"); + } + const auto threshold = node["threshold"].as(); + metric = std::make_shared(threshold); + return true; + } +}; + +template <> +struct convert { + static bool decode(const Node& node, NRMSE::Ptr& metric) { + // NB: If bigger than tolerance - fail. + if (!node["tolerance"]) { + THROW_ERROR("Metric \"nrmse\" must have \"tolerance\" attribute!"); + } + const auto tolerance = node["tolerance"].as(); + metric = std::make_shared(tolerance); + return true; + } +}; + +template <> +struct convert { + static bool decode(const Node& node, IAccuracyMetric::Ptr& metric) { + const auto type = node["name"].as(); + if (type == "norm") { + metric = node.as(); + } else if (type == "cosine") { + metric = node.as(); + } else if (type == "nrmse") { + metric = node.as(); + } else { + THROW_ERROR("Unsupported metric type: " << type); + } + return true; + } +}; + +template <> +struct convert { + static bool decode(const Node& node, GlobalOptions& opts) { + if (node["model_dir"]) { + if (!node["model_dir"]["local"]) { + THROW_ERROR("\"model_dir\" must contain \"local\" key!"); + } + opts.model_dir = node["model_dir"]["local"].as(); + } + + if (node["blob_dir"]) { + if (!node["blob_dir"]["local"]) { + THROW_ERROR("\"blob_dir\" must contain \"local\" key!"); + } + opts.blob_dir = node["blob_dir"]["local"].as(); + } + + if (node["device_name"]) { + opts.device_name = node["device_name"].as(); + } + + if (node["log_level"]) { + opts.log_level = node["log_level"].as(); + } + + if (node["compiler_type"]) { + opts.compiler_type = node["compiler_type"].as(); + } + + if (node["save_validation_outputs"]) { + const auto path = node["save_validation_outputs"].as(); + opts.save_validation_outputs = std::make_optional(std::filesystem::path{path}); + } + + return true; + } +}; + +template <> +struct convert { + static bool decode(const Node& node, OpenVINOParams& params) { + // FIXME: Worth to separate these two + const auto name = node["name"] ? 
node["name"].as() : node["path"].as(); + fs::path path{name}; + if (path.extension() == ".xml") { + auto bin_path = path; + bin_path.replace_extension(".bin"); + params.path = OpenVINOParams::ModelPath{path.string(), bin_path.string()}; + } else if (path.extension() == ".blob") { + params.path = OpenVINOParams::BlobPath{path.string()}; + } else { + // NB: *.onnx, *.pdpd, and any other format supported in future + params.path = OpenVINOParams::ModelPath{path.string(), "" /*weights*/}; + } + // NB: If "device" isn't presented in config for network, + // the device specified globally will be substitued later on + if (node["device"]) { + params.device = node["device"].as(); + } + + if (node["ip"]) { + params.input_precision = toDepth(node["ip"].as>()); + } + + if (node["op"]) { + params.output_precision = toDepth(node["op"].as>()); + } + + if (node["il"]) { + params.input_layout = node["il"].as>(); + } + + if (node["ol"]) { + params.output_layout = node["ol"].as>(); + } + + if (node["iml"]) { + params.input_model_layout = node["iml"].as>(); + } + + if (node["oml"]) { + params.output_model_layout = node["oml"].as>(); + } + + if (node["config"]) { + params.config = node["config"].as>(); + } + + // NB: Note, it should be handled after "config" is set above + if (node["priority"]) { + params.config.emplace("MODEL_PRIORITY", toPriority(node["priority"].as())); + } + + if (node["nireq"]) { + params.nireq = node["nireq"].as(); + } + return true; + } +}; + +template <> +struct convert { + static bool decode(const Node& node, ONNXRTParams::OpenVINO& ov_ep) { + if (node["params"]) { + ov_ep.params_map = node["params"].as>(); + } + if (node["device_type"]) { + std::string device_type = node["device_type"].as(); + // Check if device_type already exists in params_map (collision check) + if (ov_ep.params_map.count("device_type") > 0) { + THROW_ERROR("Configuration error: 'device_type' has already been specified in the params."); + } else { + ov_ep.params_map["device_type"] = device_type; + } + } + return true; + } +}; + +template <> +struct convert { + static bool decode(const Node& node, ONNXRTParams::EP& ep) { + const auto ep_name = node["name"].as(); + if (ep_name == "OV") { + ep = node.as(); + } else { + THROW_ERROR("Unsupported \"ep name\" value: " << ep_name); + } + return true; + } +}; + +template <> +struct convert { + static bool decode(const Node& node, ONNXRTParams& params) { + // FIXME: Worth to separate these two + params.model_path = node["name"] ? node["name"].as() : node["path"].as(); + if (node["session_options"]) { + params.session_options = node["session_options"].as>(); + } + if (node["ep"]) { + params.ep = node["ep"].as(); + } + return true; + } +}; + +template <> +struct convert { + static bool decode(const Node& node, Network& network) { + // NB: Take path stem as network tag + // Note that at this point, it's fine if names aren't unique + const auto name = node["name"].as(); + network.tag = std::filesystem::path{name}.stem().string(); + // NB: OpenVINO is default to keep back compatibility for config syntax + const auto framework = node["framework"] ? 
node["framework"].as() : "openvino"; + if (framework == "openvino") { + // NB: Parse OpenVINO model parameters such as path, device, precision, etc + network.params = node.as(); + } else if (framework == "onnxrt") { + network.params = node.as(); + } else { + THROW_ERROR("Unsupported \"framework:\" value: " << framework); + } + + if (node["random"]) { + network.initializers = node["random"].as>(); + } + if (node["metric"]) { + network.accuracy_metrics = node["metric"].as>(); + } + if (node["input_data"]) { + network.input_data = node["input_data"].as>(); + } + + if (node["output_data"]) { + network.output_data = node["output_data"].as>(); + } + return true; + } +}; + +template <> +struct convert { + static bool decode(const Node& node, CPUOp& op) { + // TODO: Assert there are no more options provided + op.time_in_us = node["time_in_us"] ? node["time_in_us"].as() : 0u; + return true; + } +}; + +template <> +struct convert { + static bool decode(const Node& node, InferOp& op) { + const auto framework = node["framework"] ? node["framework"].as() : "openvino"; + if (framework == "openvino") { + // NB: Parse OpenVINO model parameters such as path, device, precision, etc + op.params = node.as(); + } else if (framework == "onnxrt") { + op.params = node.as(); + } else { + THROW_ERROR("Unsupported \"framework:\" value: " << framework); + } + + if (node["random"]) { + op.initializers = node["random"].as>(); + } + if (node["metric"]) { + op.accuracy_metrics = node["metric"].as>(); + } + if (node["input_data"]) { + op.input_data = node["input_data"].as>(); + } + + if (node["output_data"]) { + op.output_data = node["output_data"].as>(); + } + return true; + } +}; + +template <> +struct convert { + static bool decode(const Node& node, OpDesc& opdesc) { + opdesc.tag = node["tag"].as(); + auto type = node["type"] ? node["type"].as() : "Infer"; + auto repeat_count = node["repeat_count"] ? node["repeat_count"].as() : 1u; + ASSERT(repeat_count > 0) + if (repeat_count > 1u) { + // NB: repeat_count > 1u assume that "Compound" operation will be used + type = "Compound"; + } + if (type == "Infer") { + opdesc.op = node.as(); + } else if (type == "CPU") { + opdesc.op = node.as(); + } else if (type == "Compound") { + std::vector> connections; + if (node["connections"]) { + connections = node["connections"].as>>(); + } + auto op_descs = node["op_desc"].as>(); + InferenceParamsMap inference_params; + for (const auto& op_desc : op_descs) { + if (std::holds_alternative(op_desc.op)) { + inference_params.emplace(op_desc.tag, std::get(op_desc.op).params); + } + } + opdesc.op = CompoundOp{repeat_count, std::move(inference_params), buildGraph(op_descs, connections)}; + } else { + THROW_ERROR("Unsupported operation type: \"" << type << "\"!"); + } + return true; + } +}; + +} // namespace YAML + +static std::vector> parseNetworks(const YAML::Node& node) { + ASSERT(node.IsSequence()); + TagsManager tgs_mngr; + std::vector> networks_list; + for (const auto& subnode : node) { + if (subnode.IsSequence()) { + networks_list.push_back(subnode.as>()); + } else { + networks_list.push_back({subnode.as()}); + } + // NB: Ensure all network tags are unique! 
+ for (auto& network : networks_list.back()) { + network.tag = tgs_mngr.add(network.tag); + } + } + return networks_list; +} + +static ScenarioGraph buildGraph(const std::vector>& networks_list, const uint32_t delay_in_us) { + ScenarioGraph graph; + auto src = graph.makeSource(); + std::vector producers = {src}; + for (uint32_t list_idx = 0; list_idx < networks_list.size(); ++list_idx) { + auto& networks = networks_list[list_idx]; + // NB: Delay if specified, will not be added to the beginning + // and end of the stream, ONLY between models + if (list_idx != 0u && delay_in_us != 0u) { + auto delay = graph.makeDelay(delay_in_us); + for (auto p : producers) { + graph.link(p, delay); + } + producers = {delay.out()}; + } + std::vector curr_outs; + curr_outs.reserve(networks.size()); + for (uint32_t net_idx = 0; net_idx < networks.size(); ++net_idx) { + auto infer = graph.makeInfer(networks[net_idx].tag); + for (auto p : producers) { + graph.link(p, infer); + } + curr_outs.push_back(infer.out()); + } + producers = std::move(curr_outs); + } + return graph; +} + +static InferenceParams adjustParams(OpenVINOParams&& params, const GlobalOptions& opts, const ReplaceBy& replace_by) { + // NB: Adjust the model path according to base directories provided for blobs & models + auto& path = params.path; + if (std::holds_alternative(path)) { + auto& model_path = std::get(path); + fs::path model_file_path{model_path.model}; + fs::path bin_file_path{model_path.bin}; + if (model_file_path.is_relative()) { + model_path.model = (opts.model_dir / model_file_path).string(); + } + if (!model_path.bin.empty() && bin_file_path.is_relative()) { + model_path.bin = (opts.model_dir / bin_file_path).string(); + } + } else { + ASSERT(std::holds_alternative(path)); + auto& blob_path = std::get(path); + fs::path blob_file_path{blob_path.blob}; + if (blob_file_path.is_relative()) { + blob_path.blob = (opts.blob_dir / blob_file_path).string(); + } + } + // NB: Adjust device property based on opts.device_name or replace_by + + if (!replace_by.device.empty()) { + // NB: ReplaceBy has priority - overwrite + params.device = replace_by.device; + } else if (params.device.empty()) { + // NB: Otherwise, if empty - take the value from global device name + params.device = opts.device_name; + } + + // NB: Compiler type is only relevant for NPU device + if (params.device == "NPU") { + // NB: Don't overwrite compiler type if it already has been + // specified explicitly for particular model + if (const auto it = params.config.find("NPU_COMPILER_TYPE"); it == params.config.end()) { + params.config.emplace("NPU_COMPILER_TYPE", opts.compiler_type); + } + } + return std::move(params); +} + +static InferenceParams adjustParams(ONNXRTParams&& params, const GlobalOptions& opts) { + fs::path model_file_path{params.model_path}; + if (model_file_path.is_relative()) { + params.model_path = (opts.model_dir / model_file_path).string(); + } + return std::move(params); +} + +static InferenceParams adjustParams(InferenceParams&& params, const GlobalOptions& opts, const ReplaceBy& replace_by) { + if (std::holds_alternative(params)) { + return adjustParams(std::get(std::move(params)), opts, replace_by); + } + ASSERT(std::holds_alternative(params)); + return adjustParams(std::get(std::move(params)), opts); +} + +static StreamDesc parseStream(const YAML::Node& node, const GlobalOptions& opts, const std::string& default_name, + const ReplaceBy& replace_by) { + StreamDesc stream; + + // FIXME: Create a function for the duplicate code below + stream.name = 
node["name"] ? node["name"].as() : default_name; + stream.frames_interval_in_us = 0u; + if (node["frames_interval_in_ms"]) { + stream.frames_interval_in_us = node["frames_interval_in_ms"].as() * 1000u; + if (node["target_fps"]) { + THROW_ERROR("Both \"target_fps\" and \"frames_interval_in_ms\" are defined for the stream: \"" + << stream.name << "\"! Please specify only one of them as they are mutually exclusive."); + } + } else if (node["target_fps"]) { + uint32_t target_fps = node["target_fps"].as(); + stream.frames_interval_in_us = (target_fps != 0) ? (1000u * 1000u / target_fps) : 0; + } + + if (node["target_latency_in_ms"]) { + stream.target_latency = std::make_optional(node["target_latency_in_ms"].as()); + if (stream.target_latency < 0) { + THROW_ERROR("\"target_latency_in_ms\" is negative for the stream: \"" << stream.name << "\"!"); + } + } + if (node["exec_time_in_secs"]) { + const auto exec_time_in_secs = node["exec_time_in_secs"].as(); + stream.criterion = std::make_shared(exec_time_in_secs * 1'000'000); + } + if (node["iteration_count"]) { + const auto iteration_count = node["iteration_count"].as(); + stream.criterion = std::make_shared(iteration_count); + } + + auto networks_list = parseNetworks(node["network"]); + const auto delay_in_us = node["delay_in_us"] ? node["delay_in_us"].as() : 0u; + stream.graph = buildGraph(networks_list, delay_in_us); + // NB: Collect network parameters + for (auto& networks : networks_list) { + for (auto& network : networks) { + stream.metrics_map.emplace(network.tag, std::move(network.accuracy_metrics)); + stream.initializers_map.emplace(network.tag, std::move(network.initializers)); + stream.input_data_map.emplace(network.tag, std::move(network.input_data)); + stream.output_data_map.emplace(network.tag, std::move(network.output_data)); + stream.infer_params_map.emplace(network.tag, adjustParams(std::move(network.params), opts, replace_by)); + } + } + return stream; +} + +using DependencyMap = std::unordered_map>; + +static ScenarioGraph buildGraph(const std::vector& op_descs, + const std::vector>& connections) { + // NB: Build the graph based on list of operations and connections between them + // + // The algorithm is straightforward: + // 1) For every operation create corresponding graph node + // 2) Go though connections and create the dependency map + // 3) Go through every operation and connect with its dependencies + // 3.1) If operation has no dependencies, connect it directly with the source + + // NB: For the fast access to operation node by name + std::unordered_map op_node_map; + // NB: To store the list of dependencies for every operation + std::unordered_map> dependency_map; + + // (1) For every operation create corresponding graph node + ScenarioGraph graph; + for (const auto& desc : op_descs) { + // NB: Initialize dependency list for every operation + dependency_map[desc.tag]; + // FIXME: Implement visitor + if (std::holds_alternative(desc.op)) { + op_node_map.emplace(desc.tag, graph.makeInfer(desc.tag)); + } else if (std::holds_alternative(desc.op)) { + const auto& compound = std::get(desc.op); + op_node_map.emplace( + desc.tag, graph.makeCompound(compound.repeat_count, compound.subgraph, compound.params, desc.tag)); + } else { + ASSERT(std::holds_alternative(desc.op)); + const auto& cpu = std::get(desc.op); + op_node_map.emplace(desc.tag, graph.makeDelay(cpu.time_in_us)); + } + } + + // (2) Go though connections and create the dependency map + for (const auto& tags : connections) { + if (tags.size() < 2) { + 
THROW_ERROR("Connections list must be at least size of 2!"); + } + for (uint32_t i = 1; i < tags.size(); ++i) { + // [A, B, C] - means B depends on A, and C depends on B + auto deps_it = dependency_map.find(tags[i]); + if (deps_it == dependency_map.end()) { + THROW_ERROR("Operation \"" << tags[i] << "\" hasn't been registered in op_desc list!"); + } + if (tags[i - 1] == tags[i]) { + THROW_ERROR("Operation \"" << tags[i] << "\" cannot be connected with itself!"); + } + auto& dep_set = deps_it->second; + // NB: Check if such connection already exists + auto is_inserted = deps_it->second.emplace(tags[i - 1]).second; + if (!is_inserted) { + THROW_ERROR("Connection between \"" << tags[i - 1] << "\" and \"" << tags[i] + << "\" operations already exists!"); + } + } + } + + // (3) Go through every operation and connect with its dependencies + auto src = graph.makeSource(); + for (const auto& [tag, deps] : dependency_map) { + auto op = op_node_map.at(tag); + // (3.1) If operation has no dependencies, connect it directly to the source + if (deps.empty()) { + graph.link(src, op); + } else { + for (auto dep_tag : deps) { + auto dep = op_node_map.at(dep_tag); + graph.link(dep.out(), op); + } + } + } + return graph; +} + +static StreamDesc parseAdvancedStream(const YAML::Node& node, const GlobalOptions& opts, + const std::string& default_name, const ReplaceBy& replace_by) { + StreamDesc stream; + + // FIXME: Create a function for the duplicate code below + stream.name = node["name"] ? node["name"].as() : default_name; + stream.frames_interval_in_us = 0u; + if (node["frames_interval_in_ms"]) { + stream.frames_interval_in_us = node["frames_interval_in_ms"].as() * 1000u; + if (node["target_fps"]) { + THROW_ERROR("Both \"target_fps\" and \"frames_interval_in_ms\" are defined for the stream: \"" + << stream.name << "\"! Please specify only one of them as they are mutually exclusive."); + } + } else if (node["target_fps"]) { + uint32_t target_fps = node["target_fps"].as(); + stream.frames_interval_in_us = (target_fps != 0) ? 
(1000u * 1000u / target_fps) : 0; + } + + if (node["target_latency_in_ms"]) { + stream.target_latency = std::make_optional(node["target_latency_in_ms"].as()); + if (stream.target_latency < 0) { + THROW_ERROR("\"target_latency_in_ms\" is negative for the stream: \"" << stream.name << "\"!"); + } + } + if (node["exec_time_in_secs"]) { + const auto exec_time_in_secs = node["exec_time_in_secs"].as(); + stream.criterion = std::make_shared(exec_time_in_secs * 1'000'000); + } + if (node["iteration_count"]) { + const auto iteration_count = node["iteration_count"].as(); + stream.criterion = std::make_shared(iteration_count); + } + + auto op_descs = node["op_desc"].as>(); + std::vector> connections; + if (node["connections"]) { + connections = node["connections"].as>>(); + } + + for (auto& desc : op_descs) { + if (std::holds_alternative(desc.op)) { + auto&& infer = std::get(desc.op); + stream.metrics_map.emplace(desc.tag, std::move(infer.accuracy_metrics)); + stream.initializers_map.emplace(desc.tag, std::move(infer.initializers)); + stream.input_data_map.emplace(desc.tag, std::move(infer.input_data)); + stream.output_data_map.emplace(desc.tag, std::move(infer.output_data)); + stream.infer_params_map.emplace(desc.tag, adjustParams(std::move(infer.params), opts, replace_by)); + } + if (std::holds_alternative(desc.op)) { + auto& compound = std::get(desc.op); + InferenceParamsMap& params_map = compound.params; + for (auto& pair : params_map) { + pair.second = adjustParams(std::move(pair.second), opts, replace_by); + } + } + } + + stream.graph = buildGraph(op_descs, connections); + return stream; +} + +static std::vector parseStreams(const YAML::Node& node, const GlobalOptions& opts, + const ReplaceBy& replace_by) { + std::vector streams; + uint32_t stream_idx = 0; + for (const auto& subnode : node) { + const auto default_name = std::to_string(stream_idx); + auto stream = subnode["op_desc"] ? parseAdvancedStream(subnode, opts, default_name, replace_by) + : parseStream(subnode, opts, default_name, replace_by); + streams.push_back(std::move(stream)); + ++stream_idx; + } + return streams; +} + +static std::vector parseScenarios(const YAML::Node& node, const GlobalOptions& opts, + const ReplaceBy& replace_by) { + std::vector scenarios; + for (const auto& subnode : node) { + ScenarioDesc scenario; + scenario.name = subnode["name"] ? subnode["name"].as() + : "multi_inference_" + std::to_string(scenarios.size()); + scenario.streams = parseStreams(subnode["input_stream_list"], opts, replace_by); + + if (opts.save_validation_outputs) { + for (auto& stream : scenario.streams) { + const auto& root_path = opts.save_validation_outputs.value(); + std::string stream_dir = "stream_" + stream.name; + std::filesystem::path stream_outputs_path = root_path / scenario.name / stream_dir; + stream.per_iter_outputs_path = std::make_optional(std::move(stream_outputs_path)); + } + } + scenarios.push_back(std::move(scenario)); + } + return scenarios; +} + +Config parseConfig(const YAML::Node& node, const ReplaceBy& replace_by) { + const auto global_opts = node.as(); + + // FIXME: Perhaps should be done somewhere else... 
+ Logger::global_lvl = toLogLevel(global_opts.log_level); + + Config config; + config.scenarios = parseScenarios(node["multi_inference"], global_opts, replace_by); + + ASSERT(!config.scenarios.empty()); + if (node["metric"]) { + config.metric = node["metric"].as(); + } + if (node["random"]) { + config.initializer = node["random"].as(); + } + + config.disable_high_resolution_timer = false; + if (node["disable_high_resolution_waitable_timer"]) { + config.disable_high_resolution_timer = node["disable_high_resolution_waitable_timer"].as(); + } + return config; +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/parser/config.hpp b/src/plugins/intel_npu/tools/protopipe/src/parser/config.hpp new file mode 100644 index 00000000000000..1dec64ece423b6 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/parser/config.hpp @@ -0,0 +1,12 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "parser/parser.hpp" + +#include + +Config parseConfig(const YAML::Node& root, const ReplaceBy& replace_by); diff --git a/src/plugins/intel_npu/tools/protopipe/src/parser/parser.cpp b/src/plugins/intel_npu/tools/protopipe/src/parser/parser.cpp new file mode 100644 index 00000000000000..b4f48b7415615c --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/parser/parser.cpp @@ -0,0 +1,20 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "parser/parser.hpp" +#include "parser/config.hpp" + +#include "utils/error.hpp" + +#include + +ScenarioParser::ScenarioParser(const std::string& filepath): m_filepath(filepath) { +} + +Config ScenarioParser::parseScenarios(const ReplaceBy& replace_by) { + const auto root = YAML::LoadFile(m_filepath); + // TODO: Extend to any other config syntax + return parseConfig(root, replace_by); +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/parser/parser.hpp b/src/plugins/intel_npu/tools/protopipe/src/parser/parser.hpp new file mode 100644 index 00000000000000..ec228ee8070fd3 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/parser/parser.hpp @@ -0,0 +1,61 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "scenario/criterion.hpp" +#include "scenario/inference.hpp" +#include "scenario/scenario_graph.hpp" + +struct StreamDesc { + // NB: Commons parameters for all modes + std::string name; + uint64_t frames_interval_in_us; + ScenarioGraph graph; + InferenceParamsMap infer_params_map; + ITermCriterion::Ptr criterion; + // Mode specific params + ModelsAttrMap metrics_map; + ModelsAttrMap initializers_map; + ModelsAttrMap input_data_map; + ModelsAttrMap output_data_map; + std::optional target_latency; + std::optional per_iter_outputs_path; +}; + +struct ScenarioDesc { + std::string name; + std::vector streams; + bool disable_high_resolution_timer; +}; + +struct Config { + IRandomGenerator::Ptr initializer; + IAccuracyMetric::Ptr metric; + bool disable_high_resolution_timer; + std::vector scenarios; +}; + +struct ReplaceBy { + std::string device; +}; + +struct IScenarioParser { + virtual Config parseScenarios(const ReplaceBy& replace_by) = 0; + virtual ~IScenarioParser() = default; +}; + +class ScenarioParser : public IScenarioParser { +public: + ScenarioParser(const std::string& filepath); + Config parseScenarios(const ReplaceBy& replace_by) override; + +private: + std::string m_filepath; +}; diff --git 
a/src/plugins/intel_npu/tools/protopipe/src/result.cpp b/src/plugins/intel_npu/tools/protopipe/src/result.cpp new file mode 100644 index 00000000000000..23c6c315eaf123 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/result.cpp @@ -0,0 +1,22 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "result.hpp" +#include "utils/error.hpp" + +Result::Result(const Error& error): m_status(error){}; +Result::Result(const Success& success): m_status(success){}; + +Result::operator bool() const { + return std::holds_alternative(m_status); +} + +std::string Result::str() const { + if (std::holds_alternative(m_status)) { + return std::get(m_status).msg; + } + ASSERT(std::holds_alternative(m_status)); + return std::get(m_status).reason; +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/result.hpp b/src/plugins/intel_npu/tools/protopipe/src/result.hpp new file mode 100644 index 00000000000000..08cbd7b06fc940 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/result.hpp @@ -0,0 +1,30 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +struct Success { + std::string msg; +}; +struct Error { + std::string reason; +}; + +class Result { +public: + Result() = default; // monostate (empty) + Result(const Error& error); + Result(const Success& success); + + operator bool() const; + std::string str() const; + +private: + using Status = std::variant; + Status m_status; +}; diff --git a/src/plugins/intel_npu/tools/protopipe/src/scenario/accuracy_metrics.cpp b/src/plugins/intel_npu/tools/protopipe/src/scenario/accuracy_metrics.cpp new file mode 100644 index 00000000000000..9f779b8dab8cfd --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/scenario/accuracy_metrics.cpp @@ -0,0 +1,121 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "scenario/accuracy_metrics.hpp" + +#include "utils/error.hpp" + +Norm::Norm(const double tolerance): m_tolerance(tolerance){}; + +Result Norm::compare(const cv::Mat& lhs, const cv::Mat& rhs) { + cv::Mat lhsf32, rhsf32; + lhs.convertTo(lhsf32, CV_32F); + rhs.convertTo(rhsf32, CV_32F); + + ASSERT(lhsf32.total() == rhsf32.total()); + auto value = cv::norm(lhsf32, rhsf32); + + if (value > m_tolerance) { + std::stringstream ss; + ss << value << " > " << m_tolerance; + return Error{ss.str()}; + } + return Success{}; +} + +std::string Norm::str() { + std::stringstream ss; + ss << "Norm{tolerance: " << m_tolerance << "}"; + return ss.str(); +} + +Cosine::Cosine(const double threshold): m_threshold(threshold){}; + +Result Cosine::compare(const cv::Mat& lhs, const cv::Mat& rhs) { + cv::Mat lhsf32, rhsf32; + lhs.convertTo(lhsf32, CV_32F); + rhs.convertTo(rhsf32, CV_32F); + + ASSERT(lhsf32.total() == rhsf32.total()); + const auto* lhsptr = lhsf32.ptr(); + const auto* rhsptr = rhsf32.ptr(); + + double lhsdot = 0.0, rhsdot = 0.0, numr = 0.0; + for (size_t i = 0; i < lhsf32.total(); ++i) { + numr += lhsptr[i] * rhsptr[i]; + lhsdot += lhsptr[i] * lhsptr[i]; + rhsdot += rhsptr[i] * rhsptr[i]; + } + + const double eps = 1e-9; + if (lhsdot < eps || rhsdot < eps) { + return Error{"Division by zero!"}; + } + + const double similarity = numr / (std::sqrt(lhsdot) * std::sqrt(rhsdot)); + if (similarity > (1.0 + eps) || similarity < -(1.0 + eps)) { + std::stringstream ss; + ss << "Invalid result " << similarity << " (valid range [-1 : +1])"; + return Error{ss.str()}; + } + + 
if (m_threshold - eps > similarity) { + std::stringstream ss; + ss << similarity << " < " << m_threshold; + return Error{ss.str()}; + } + return Success{}; +} + +std::string Cosine::str() { + std::stringstream ss; + ss << "Cosine{threshold: " << m_threshold << "}"; + return ss.str(); +} + +NRMSE::NRMSE(const double tolerance): m_tolerance(tolerance){}; + +Result NRMSE::compare(const cv::Mat& lhs, const cv::Mat& rhs) { + cv::Mat lhsf32, rhsf32; + lhs.convertTo(lhsf32, CV_32F); + rhs.convertTo(rhsf32, CV_32F); + + const auto size = lhsf32.total(); + if (size == 0) { + std::stringstream ss; + ss << "Empty output and reference tensors, nrmse loss set to 0" << std::endl; + return Success{}; + } + + const auto* lhsptr = lhsf32.ptr(); + const auto* rhsptr = rhsf32.ptr(); + + double error = 0.0; + float lhsmax = 0.0, rhsmax = 0.0, lhsmin = 0.0, rhsmin = 0.0; + + for (size_t i = 0; i < size; ++i) { + const auto diff = lhsptr[i] - rhsptr[i]; + error += diff * diff; + lhsmax = std::max(lhsptr[i], lhsmax); + rhsmax = std::max(rhsptr[i], rhsmax); + lhsmin = std::min(lhsptr[i], lhsmin); + rhsmin = std::min(rhsptr[i], rhsmin); + } + + double nrmse = sqrt(error / size) / std::max(0.001f, std::max(lhsmax - lhsmin, rhsmax - rhsmin)); + + if (m_tolerance < nrmse) { + std::stringstream ss; + ss << nrmse << " > " << m_tolerance; + return Error{ss.str()}; + } + return Success{}; +} + +std::string NRMSE::str() { + std::stringstream ss; + ss << "nrmse{tolerance: " << m_tolerance << "}"; + return ss.str(); +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/scenario/accuracy_metrics.hpp b/src/plugins/intel_npu/tools/protopipe/src/scenario/accuracy_metrics.hpp new file mode 100644 index 00000000000000..010039360ecb9b --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/scenario/accuracy_metrics.hpp @@ -0,0 +1,52 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "result.hpp" + +struct IAccuracyMetric { + using Ptr = std::shared_ptr; + virtual Result compare(const cv::Mat& lhs, const cv::Mat& rhs) = 0; + virtual std::string str() = 0; + virtual ~IAccuracyMetric() = default; +}; + +class Norm : public IAccuracyMetric { +public: + using Ptr = std::shared_ptr; + explicit Norm(const double tolerance); + Result compare(const cv::Mat& lhs, const cv::Mat& rhs) override; + std::string str() override; + +private: + double m_tolerance; +}; + +class Cosine : public IAccuracyMetric { +public: + using Ptr = std::shared_ptr; + explicit Cosine(const double threshold); + Result compare(const cv::Mat& lhs, const cv::Mat& rhs) override; + std::string str() override; + +private: + double m_threshold; +}; + +class NRMSE : public IAccuracyMetric { +public: + using Ptr = std::shared_ptr; + explicit NRMSE(const double tolerance); + Result compare(const cv::Mat& lhs, const cv::Mat& rhs) override; + std::string str() override; + +private: + double m_tolerance; +}; diff --git a/src/plugins/intel_npu/tools/protopipe/src/scenario/criterion.cpp b/src/plugins/intel_npu/tools/protopipe/src/scenario/criterion.cpp new file mode 100644 index 00000000000000..b348fe92e811cb --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/scenario/criterion.cpp @@ -0,0 +1,72 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "criterion.hpp" + +#include + +#include "utils/utils.hpp" + +Iterations::Iterations(uint64_t num_iters): m_num_iters(num_iters), m_counter(0) { +} + +bool 
Iterations::check() const { + return m_counter != m_num_iters; +} + +void Iterations::update() { + ++m_counter; +} + +void Iterations::init() { + m_counter = 0; +} + +ITermCriterion::Ptr Iterations::clone() const { + return std::make_shared(*this); +} + +TimeOut::TimeOut(uint64_t time_in_us): m_time_in_us(time_in_us), m_start_ts(-1) { +} + +bool TimeOut::check() const { + return utils::timestamp() - m_start_ts < m_time_in_us; +} + +void TimeOut::update(){/* do nothing */}; + +void TimeOut::init() { + m_start_ts = utils::timestamp(); +} + +ITermCriterion::Ptr TimeOut::clone() const { + return std::make_shared(*this); +} + +CombinedCriterion::CombinedCriterion(ITermCriterion::Ptr lhs, ITermCriterion::Ptr rhs): m_lhs(lhs), m_rhs(rhs) { +} + +CombinedCriterion::CombinedCriterion(const CombinedCriterion& other) { + m_lhs = other.m_lhs->clone(); + m_rhs = other.m_rhs->clone(); +} + +bool CombinedCriterion::check() const { + return m_lhs->check() && m_rhs->check(); +} + +void CombinedCriterion::update() { + m_lhs->update(); + m_rhs->update(); +}; + +void CombinedCriterion::init() { + m_lhs->init(); + m_rhs->init(); +} + +ITermCriterion::Ptr CombinedCriterion::clone() const { + return std::make_shared(*this); +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/scenario/criterion.hpp b/src/plugins/intel_npu/tools/protopipe/src/scenario/criterion.hpp new file mode 100644 index 00000000000000..28b440a7b3b0a3 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/scenario/criterion.hpp @@ -0,0 +1,58 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +struct ITermCriterion { + using Ptr = std::shared_ptr; + virtual void init() = 0; + virtual void update() = 0; + virtual bool check() const = 0; + virtual ITermCriterion::Ptr clone() const = 0; +}; + +class Iterations : public ITermCriterion { +public: + Iterations(uint64_t num_iters); + + void init() override; + void update() override; + bool check() const override; + ITermCriterion::Ptr clone() const override; + +private: + uint64_t m_num_iters; + uint64_t m_counter; +}; + +class TimeOut : public ITermCriterion { +public: + TimeOut(uint64_t time_in_us); + + void init() override; + void update() override; + bool check() const override; + ITermCriterion::Ptr clone() const override; + +private: + uint64_t m_time_in_us; + uint64_t m_start_ts; +}; + +class CombinedCriterion : public ITermCriterion { +public: + CombinedCriterion(ITermCriterion::Ptr lhs, ITermCriterion::Ptr rhs); + CombinedCriterion(const CombinedCriterion& other); + + void init() override; + void update() override; + bool check() const override; + ITermCriterion::Ptr clone() const override; + +private: + ITermCriterion::Ptr m_lhs, m_rhs; +}; diff --git a/src/plugins/intel_npu/tools/protopipe/src/scenario/inference.cpp b/src/plugins/intel_npu/tools/protopipe/src/scenario/inference.cpp new file mode 100644 index 00000000000000..c1648f3755cbfd --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/scenario/inference.cpp @@ -0,0 +1,17 @@ +// +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "scenario/inference.hpp" + +#include +#include + +std::vector extractLayerNames(const std::vector& layers) { + std::vector names; + std::transform(layers.begin(), layers.end(), std::back_inserter(names), [](const auto& layer) { + return layer.name; + }); + return names; +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/scenario/inference.hpp 
b/src/plugins/intel_npu/tools/protopipe/src/scenario/inference.hpp new file mode 100644 index 00000000000000..c4fd85aa26721a --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/scenario/inference.hpp @@ -0,0 +1,111 @@ +// +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +template +using AttrMap = std::map; +// NB: This type is supposed to be used to hold in/out layers +// attributes such as precision, layout, shape etc. +// +// User can provide attributes either: +// 1. std::monostate - No value specified explicitly. +// 2. Attr - value specified explicitly that should be broadcasted to all layers. +// 3. AttrMap[str->T] - map specifies value for particular layer. +template +using LayerVariantAttr = std::variant, Attr>; + +// NB: Map of model tag -> LayerVariantAttr +template +using ModelsAttrMap = std::unordered_map>; + +struct LayerInfo { + std::string name; + std::vector dims; + int prec; +}; +using LayersInfo = std::vector; + +std::vector extractLayerNames(const std::vector& layers); + +template +std::optional lookUp(const std::map& map, const K& key) { + const auto it = map.find(key); + if (it == map.end()) { + return {}; + } + return std::make_optional(std::move(it->second)); +} + +template +static AttrMap unpackLayerAttr(const LayerVariantAttr& attr, const std::vector& layer_names, + const std::string& attrname) { + AttrMap attrmap; + if (std::holds_alternative(attr)) { + auto value = std::get(attr); + for (const auto& name : layer_names) { + attrmap.emplace(name, value); + } + } else if (std::holds_alternative>(attr)) { + attrmap = std::get>(attr); + std::unordered_set layers_set{layer_names.begin(), layer_names.end()}; + for (const auto& [name, attr] : attrmap) { + const auto it = layers_set.find(name); + if (it == layers_set.end()) { + throw std::logic_error("Failed to find layer \"" + name + "\" to specify " + attrname); + } + } + } + return attrmap; +} + +struct OpenVINOParams { + struct ModelPath { + std::string model; + std::string bin; + }; + struct BlobPath { + std::string blob; + }; + using Path = std::variant; + + // NB: Mandatory parameters + Path path; + std::string device; + // NB: Optional parameters + LayerVariantAttr input_precision; + LayerVariantAttr output_precision; + LayerVariantAttr input_layout; + LayerVariantAttr output_layout; + LayerVariantAttr input_model_layout; + LayerVariantAttr output_model_layout; + std::map config; + size_t nireq = 1u; +}; + +struct ONNXRTParams { + std::string model_path; + std::map session_options; + // TODO: Extend for other available ONNXRT EP (e.g DML, CoreML, TensorRT, etc) + struct OpenVINO { + std::map params_map; + }; + // NB: std::monostate stands for the default MLAS Execution provider + using EP = std::variant; + EP ep; +}; + +using InferenceParams = std::variant; +using InferenceParamsMap = std::unordered_map; diff --git a/src/plugins/intel_npu/tools/protopipe/src/scenario/scenario_graph.cpp b/src/plugins/intel_npu/tools/protopipe/src/scenario/scenario_graph.cpp new file mode 100644 index 00000000000000..96984966fbc6fc --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/scenario/scenario_graph.cpp @@ -0,0 +1,40 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "scenario/scenario_graph.hpp" + +DataNode::DataNode(Graph* graph, NodeHandle nh): m_nh(nh) { + graph->meta(nh).set(Data{}); +}; + 
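+// NB: A minimal usage sketch, assuming a single inference stage tagged "model"
+// followed by a 5000 us delay (tag and delay value are illustrative only),
+// showing how the API defined in this file is meant to be combined:
+//
+//   ScenarioGraph graph;
+//   DataNode src   = graph.makeSource();
+//   OpNode   infer = graph.makeInfer("model");
+//   graph.link(src, infer);
+//   OpNode   delay = graph.makeDelay(5000u);
+//   graph.link(infer.out(), delay);
+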
+OpNode::OpNode(NodeHandle nh, DataNode out_data): m_nh(nh), m_out_data(out_data) { +} + +DataNode OpNode::out() { + return m_out_data; +} + +DataNode ScenarioGraph::makeSource() { + NodeHandle nh = m_graph.create(); + m_graph.meta(nh).set(Source{}); + return DataNode(&m_graph, nh); +} + +void ScenarioGraph::link(DataNode data, OpNode op) { + m_graph.link(data.m_nh, op.m_nh); +} + +OpNode ScenarioGraph::makeInfer(const std::string& tag) { + return makeOp(Infer{tag}); +} + +OpNode ScenarioGraph::makeDelay(uint64_t time_in_us) { + return makeOp(Delay{time_in_us}); +} + +OpNode ScenarioGraph::makeCompound(uint64_t repeat_count, ScenarioGraph subgraph, InferenceParamsMap infer_params, + const std::string& tag) { + return makeOp(Compound{repeat_count, subgraph, infer_params, tag}); +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/scenario/scenario_graph.hpp b/src/plugins/intel_npu/tools/protopipe/src/scenario/scenario_graph.hpp new file mode 100644 index 00000000000000..a9b6523a6be52d --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/scenario/scenario_graph.hpp @@ -0,0 +1,102 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "graph.hpp" +#include "scenario/accuracy_metrics.hpp" +#include "scenario/inference.hpp" +#include "utils/data_providers.hpp" + +struct Source {}; +struct Data {}; + +class DataNode { +public: + DataNode(Graph* graph, NodeHandle nh); + +private: + friend class ScenarioGraph; + NodeHandle m_nh; +}; + +class OpNode; +template <> +struct std::hash; + +class OpNode { +public: + OpNode(NodeHandle nh, DataNode out_data); + DataNode out(); + +private: + friend class ScenarioGraph; + friend struct std::hash; + NodeHandle m_nh; + DataNode m_out_data; +}; + +namespace std { +template <> +struct hash { + uint64_t operator()(const OpNode& op_node) const { + return std::hash()(op_node.m_nh); + } +}; +} // namespace std + +class ScenarioGraph { +public: + DataNode makeSource(); + OpNode makeInfer(const std::string& tag); + OpNode makeDelay(uint64_t time_in_us); + OpNode makeCompound(uint64_t repeat_count, ScenarioGraph subgraph, InferenceParamsMap infer_params, + const std::string& tag); + + void link(DataNode data, OpNode op); + + template + void pass(F&& f) { + f(m_graph); + } + +private: + template + OpNode makeOp(Kind&& kind); + +private: + Graph m_graph; +}; + +struct Infer { + std::string tag; +}; + +struct Delay { + uint64_t time_in_us; +}; + +struct Compound { + uint64_t repeat_count; + ScenarioGraph subgraph; + InferenceParamsMap infer_params; + std::string tag; +}; + +struct Op { + using Kind = std::variant; + Kind kind; +}; + +template +OpNode ScenarioGraph::makeOp(Kind&& kind) { + auto op_nh = m_graph.create(); + auto out_nh = m_graph.create(); + m_graph.meta(op_nh).set(Op{std::forward(kind)}); + m_graph.link(op_nh, out_nh); + return OpNode(op_nh, DataNode(&m_graph, out_nh)); +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/computation.cpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/computation.cpp new file mode 100644 index 00000000000000..ad0abc7fe89f9b --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/computation.cpp @@ -0,0 +1,42 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "simulation/computation.hpp" + +Computation::Computation(cv::GComputation&& comp, cv::GCompileArgs&& args, std::vector&& metas, GraphDesc&& desc) + : m_comp(std::move(comp)), + 
m_compile_args(std::move(args)), + m_out_meta(std::move(metas)), + m_desc(std::move(desc)) { +} + +uint32_t Computation::getMaxParallelBranches() const { + return m_desc.max_parallel_branches; +} + +const std::vector& Computation::getOutMeta() const { + return m_out_meta; +} + +cv::GCompiled Computation::compile(cv::GMetaArgs&& in_meta, cv::GCompileArgs&& args) { + auto compile_args = m_compile_args; + compile_args += std::move(args); + return m_comp.compile(std::move(in_meta), std::move(compile_args)); +} + +cv::GStreamingCompiled Computation::compileStreaming(cv::GMetaArgs&& in_meta, cv::GCompileArgs&& args) { + auto compile_args = m_compile_args; + compile_args += std::move(args); + return m_comp.compileStreaming(std::move(in_meta), std::move(compile_args)); +} + +cv::GMetaArgs descr_of(const std::vector& sources) { + cv::GMetaArgs meta; + meta.reserve(sources.size()); + for (auto src : sources) { + meta.push_back(src->descr_of()); + } + return meta; +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/computation.hpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/computation.hpp new file mode 100644 index 00000000000000..f9eba3b8c95a5f --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/computation.hpp @@ -0,0 +1,36 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "graph.hpp" +#include "simulation/dummy_source.hpp" + +#include +#include + +class Computation { +public: + // NB: Holds information about Graph structure + struct GraphDesc { + const uint32_t max_parallel_branches; + }; + + Computation(cv::GComputation&& comp, cv::GCompileArgs&& args, std::vector&& metas, GraphDesc&& desc); + + uint32_t getMaxParallelBranches() const; + const std::vector& getOutMeta() const; + + cv::GCompiled compile(cv::GMetaArgs&& in_meta, cv::GCompileArgs&& args = {}); + cv::GStreamingCompiled compileStreaming(cv::GMetaArgs&& in_meta, cv::GCompileArgs&& args = {}); + +private: + cv::GComputation m_comp; + cv::GCompileArgs m_compile_args; + std::vector m_out_meta; + GraphDesc m_desc; +}; + +cv::GMetaArgs descr_of(const std::vector& sources); diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/computation_builder.cpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/computation_builder.cpp new file mode 100644 index 00000000000000..d43a84ef5fe3a8 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/computation_builder.cpp @@ -0,0 +1,462 @@ +// +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "simulation/computation_builder.hpp" +#include "simulation/layers_reader.hpp" +#include "simulation/operations.hpp" +#include "simulation/performance_mode.hpp" +#include "simulation/simulation.hpp" + +#include "utils/error.hpp" + +#include + +struct OpBuilder { + void build(NodeHandle nh, const Infer& infer); + void build(NodeHandle nh, const Delay& delay); + void build(NodeHandle nh, const Compound& compound); + + Graph& graph; + IBuildStrategy::Ptr strategy; + const InferenceParamsMap& params_map; +}; + +void OpBuilder::build(NodeHandle nh, const Compound& compound) { + // Retrieving destination nodes of the current node nh + auto out_nhs = nh->dstNodes(); + + // NB: The Dummy node ensures proper handling of multiple inputs + auto dummy_nh = graph.create(); + auto provider = std::make_shared(utils::createRandom({1}, CV_8U)); + DummyCall dummy_call{{provider}, 0}; + 
graph.meta(dummy_nh).set(GOperation{std::move(dummy_call)}); + auto in_nhs = nh->srcNodes(); + + // removing input edges to go through dummy node and not to compound node + auto src_edges = nh->srcEdges(); + for (size_t i = 0; i < src_edges.size(); ++i) { + graph.remove(src_edges[i]); + } + + for (uint32_t i = 0; i < in_nhs.size(); ++i) { + graph.meta(graph.link(in_nhs[i], dummy_nh)).set(InputIdx{i}); // Linking in_nhs with dummy_nh + } + + auto dummy_out_nh = graph.create(); // Creating output dunmmy node + graph.meta(graph.link(dummy_nh, dummy_out_nh)) + .set(OutputIdx{0u}); // linking dummy node handle and output dummy node handle + graph.meta(dummy_out_nh).set(GData{}); + graph.meta(graph.link(dummy_out_nh, nh)).set(InputIdx{0u}); + + ASSERT(nh->dstEdges().size() == 1u); + auto dst_edge = nh->dstEdges().front(); + graph.meta(dst_edge).set(OutputIdx{0u}); + + graph.meta(graph.link(nh, out_nhs.front())).set(OutputIdx{0u}); + + ModelsAttrMap input_data_map; + ModelsAttrMap initializers_map; + + for (const auto& [tag, params] : compound.infer_params) { + input_data_map[tag]; + initializers_map[tag]; + } + + PerformanceSimulation::Options opts{ + nullptr, // global_initializer + initializers_map, + input_data_map, + true, // inference_only + {} // target latency + }; + + Simulation::Config cfg{compound.tag, + 0u, // frames_interval_in_ms + false, // disable_high_resolution_timer + compound.subgraph, compound.infer_params}; + + auto compiled = std::make_shared(std::move(cfg), std::move(opts)) + ->compileSync(false /*drop_frames*/); + auto term_criterion = std::make_shared(compound.repeat_count); + auto f = [compiled, term_criterion]() { + compiled->run(term_criterion); + }; + + CompoundCall compound_call{f}; + graph.meta(nh).set(GOperation{std::move(compound_call)}); +} + +void OpBuilder::build(NodeHandle nh, const Delay& delay) { + auto in_nhs = nh->srcNodes(); + auto out_nhs = nh->dstNodes(); + // FIXME: Once nh is removed, delay info is no longer alive!!! 
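+    // NB: Copy the value needed below before graph.remove(nh) is called,
+    // since removing the node also destroys the Delay meta referenced here.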
+ const auto time_in_us = delay.time_in_us; + graph.remove(nh); + + auto delay_nh = graph.create(); + auto provider = std::make_shared(utils::createRandom({1}, CV_8U)); + graph.meta(delay_nh).set(GOperation{DummyCall{{provider}, time_in_us}}); + + for (uint32_t i = 0; i < in_nhs.size(); ++i) { + graph.meta(graph.link(in_nhs[i], delay_nh)).set(InputIdx{i}); + } + graph.meta(graph.link(delay_nh, out_nhs.front())).set(OutputIdx{0u}); +} + +void OpBuilder::build(NodeHandle nh, const Infer& infer) { + const auto& params = params_map.at(infer.tag); + auto [in_layers, out_layers] = LayersReader::readLayers(params); + InferDesc desc{infer.tag, std::move(in_layers), std::move(out_layers)}; + + auto out_nhs = nh->dstNodes(); + ASSERT(out_nhs.size() == 1); + + auto [providers, in_meta, out_meta, disable_copy] = strategy->build(desc); + ASSERT(providers.size() == desc.input_layers.size()); + ASSERT(in_meta.size() == desc.input_layers.size()); + ASSERT(out_meta.size() == desc.output_layers.size()); + + // NB: Check if some of the Delay's was fused to this Infer + uint64_t delay_in_us = 0u; + if (graph.meta(nh).has()) { + delay_in_us = graph.meta(nh).get().time_in_us; + } + + auto dummy_nh = graph.create(); + DummyCall dummy_call{providers, delay_in_us, disable_copy}; + graph.meta(dummy_nh).set(GOperation{std::move(dummy_call)}); + auto in_nhs = nh->srcNodes(); + for (uint32_t i = 0; i < in_nhs.size(); ++i) { + graph.meta(graph.link(in_nhs[i], dummy_nh)).set(InputIdx{i}); + } + + graph.remove(nh); + + auto infer_nh = graph.create(); + for (uint32_t layer_idx = 0; layer_idx < desc.input_layers.size(); ++layer_idx) { + // NB: Create dummy out node and link with dummy. + auto dummy_out_nh = graph.create(); + graph.meta(dummy_out_nh) += std::move(in_meta[layer_idx]); + graph.meta(graph.link(dummy_nh, dummy_out_nh)).set(OutputIdx{layer_idx}); + graph.meta(dummy_out_nh).set(GData{}); + // NB: Finally link dummy out with infer + graph.meta(graph.link(dummy_out_nh, infer_nh)).set(InputIdx{layer_idx}); + } + + auto out_nh = out_nhs.front(); + graph.meta(graph.link(infer_nh, out_nh)).set(OutputIdx{0u}); + graph.meta(out_nh) += out_meta.front(); + for (uint32_t layer_idx = 1; layer_idx < desc.output_layers.size(); ++layer_idx) { + auto infer_out_nh = graph.create(); + graph.meta(infer_out_nh) = std::move(out_meta[layer_idx]); + graph.meta(infer_out_nh).set(GData{}); + graph.meta(graph.link(infer_nh, infer_out_nh)).set(OutputIdx{layer_idx}); + } + + InferCall infer_call{desc.tag, extractLayerNames(desc.input_layers), extractLayerNames(desc.output_layers)}; + graph.meta(infer_nh).set(GOperation{std::move(infer_call)}); +}; + +static bool fuseDelay(Graph& graph, NodeHandle nh, const Delay& delay) { + // NB: Current fusing is trivial and applied only for the following case: + // 1) Delay has only single Infer reader + // 2) Infer doesn't have any other writers except Delay + // e.g: [Delay] -> (out) -> [Infer] + + // NB: Access readers of delay output data node. + auto delay_out_nh = nh->dstNodes().front(); + auto out_edges = delay_out_nh->dstEdges(); + // NB: Don't fuse Delay either if it has multiple readers + // or doesn't have readers at all (1) + if (out_edges.size() != 1u) { + return false; + } + + auto out_edge = out_edges.front(); + auto op_nh = out_edge->dstNode(); + auto op = graph.meta(op_nh).get().kind; + // NB: Don't fuse Delay if reader either not an Infer (1) + // or it has other writers except Delay (2). 
+ if (!std::holds_alternative(op) || op_nh->srcEdges().size() != 1u) { + // TODO: Can be also fused to another "delay". + return false; + } + + // NB: Fuse the Delay into Infer: + // 1) Assign Delay meta directly to Infer + // 2) Remove Delay node + // 3) Redirect Delay writers to Infer + graph.meta(op_nh).set(delay); + for (auto in_nh : nh->srcNodes()) { + graph.link(in_nh, op_nh); + } + graph.remove(nh); + graph.remove(delay_out_nh); + + return true; +} + +struct Protocol { + cv::GProtoArgs graph_inputs; + cv::GProtoArgs graph_outputs; +}; + +enum class NodeState { EXPLORING, VISITED }; + +static void visit(NodeHandle nh, std::unordered_map& state) { + auto curr_node_it = state.emplace(nh, NodeState::EXPLORING).first; + for (const auto& dst_nh : nh->dstNodes()) { + const auto dst_it = state.find(dst_nh); + if (dst_it == state.end()) { + visit(dst_nh, state); + } else if (dst_it->second == NodeState::EXPLORING) { + THROW_ERROR("Scenario graph has a cycle!"); + } + } + curr_node_it->second = NodeState::VISITED; +}; + +namespace passes { + +// NB: Throw an exception if there is a cycle in graph +void throwIfCycle(Graph& graph) { + std::unordered_map state; + for (const auto& nh : graph.nodes()) { + if (state.find(nh) == state.end()) { + visit(nh, state); + } + } +} + +// NB: Determines what would be the computation graph +// inputs and outputs and marks intermediate data nodes +void init(Graph& graph) { + ASSERT(!graph.nodes().empty()); + uint32_t num_sources = 0; + for (auto nh : graph.nodes()) { + if (graph.meta(nh).has()) { + ++num_sources; + graph.meta(nh).set(GraphInput{}); + } else { + // NB: Check that graph is connected + ASSERT(!nh->srcNodes().empty()); + } + if (nh->dstNodes().empty()) { + ASSERT(graph.meta(nh).has()); + graph.meta(nh).set(GraphOutput{}); + } + if (!graph.meta(nh).has()) { + ASSERT(graph.meta(nh).has()); + graph.meta(nh).set(GData{}); + } + } + ASSERT(num_sources != 0); +}; + +// NB: Fuses delay to the inference nodes as the delay can be performed +// as part of the model dummy preprocessing +void fuseDelays(Graph& graph) { + // NB: Iterate over graph nodes until all delays are fused. + while (true) { + bool is_fused = false; + for (auto nh : graph.nodes()) { + if (!graph.meta(nh).has()) { + continue; + } + auto op = graph.meta(nh).get().kind; + if (std::holds_alternative(op)) { + auto delay = std::get(op); + if (fuseDelay(graph, nh, delay)) { + is_fused = true; + break; + } + } + } + // NB: If delay was fused, some of the nodes were removed + // Iterate one more time... 
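+        // NB: The removal above may invalidate the node handles being iterated,
+        // so the scan over graph.nodes() is restarted from scratch.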
+ if (!is_fused) { + break; + } + } +}; + +// NB: Finds the maximum parallelism depth to tell concurrent executor +// how many threads should be used for execution +void findMaxParallelBranches(Graph& graph, uint32_t& max_parallel_branches) { + // NB: Basically the maximum parallelism in computational graph + // is the maximum width of its level in BFS traversal, taking into + // account that dependencies for the node are resolved + std::unordered_set curr_lvl; + for (auto nh : graph.nodes()) { + if (graph.meta(nh).has()) { + for (auto op_nh : nh->dstNodes()) { + curr_lvl.emplace(op_nh); + } + } + } + + std::unordered_set visited; + + auto get_all_deps = [&](auto nh) { + std::unordered_set deps; + for (auto in_nhs : nh->srcNodes()) { + for (auto op_nhs : in_nhs->srcNodes()) { + deps.emplace(op_nhs); + } + } + return deps; + }; + + auto all_deps_resolved = [&](auto nh) { + auto deps = get_all_deps(nh); + return std::all_of(deps.begin(), deps.end(), [&](auto dep) { + return visited.find(dep) != visited.end(); + }); + }; + + max_parallel_branches = static_cast(curr_lvl.size()); + while (!curr_lvl.empty()) { + std::unordered_set next_lvl; + for (auto nh : curr_lvl) { + visited.emplace(nh); + ASSERT(nh->dstNodes().size() == 1u); + auto data_nh = nh->dstNodes().front(); + for (auto op_nh : data_nh->dstNodes()) { + if (all_deps_resolved(op_nh)) { + next_lvl.emplace(op_nh); + } + } + } + if (next_lvl.size() > max_parallel_branches) { + max_parallel_branches = static_cast(next_lvl.size()); + } + curr_lvl = std::move(next_lvl); + } +} + +// NB: Build "G" operations according to scenario graph nodes +void buildOperations(Graph& graph, IBuildStrategy::Ptr strategy, const InferenceParamsMap& params_map) { + OpBuilder builder{graph, strategy, params_map}; + for (auto nh : graph.nodes()) { + // NB: Skip data nodes + if (!graph.meta(nh).has()) { + continue; + } + std::visit( + [nh, &builder](const auto& op) { + builder.build(nh, op); + }, + graph.meta(nh).get().kind); + } + + for (auto nh : graph.nodes()) { + // NB: Make sure all data nodes that needs to be + // dumped or validated are graph outputs. + if (!graph.meta(nh).has() && (graph.meta(nh).has() || graph.meta(nh).has())) { + graph.meta(nh).set(GraphOutput{}); + } + } +}; + +void buildComputation(Graph& graph, Protocol& proto) { + cv::GProtoArgs graph_inputs; + cv::GProtoArgs graph_outputs; + + std::unordered_map all_data; + auto sorted = graph.sorted(); + + // NB: Initialize "G" inputs + for (auto nh : sorted) { + if (graph.meta(nh).has()) { + auto it = all_data.emplace(nh, cv::GProtoArg{cv::GMat()}).first; + graph_inputs.push_back(it->second); + } + } + // NB: Apply "G" operations in topological order + for (auto nh : sorted) { + if (graph.meta(nh).has()) { + const auto& operation = graph.meta(nh).get(); + // NB: Map input args to the correct input index. + std::unordered_map idx_to_arg; + auto in_ehs = nh->srcEdges(); + for (auto in_eh : in_ehs) { + ASSERT(graph.meta(in_eh).has()); + const uint32_t in_idx = graph.meta(in_eh).get().idx; + auto arg = all_data.at(in_eh->srcNode()); + idx_to_arg.emplace(in_idx, arg); + } + cv::GProtoArgs in_args; + for (uint32_t idx = 0; idx < idx_to_arg.size(); ++idx) { + in_args.push_back(idx_to_arg.at(idx)); + } + // NB: Link G-API operation with its io data. + auto out_args = operation.on(in_args); + // TODO: Validation in/out amount and types... + // NB: Map output args to the correct index. 
+ auto out_ehs = nh->dstEdges(); + for (auto out_eh : out_ehs) { + ASSERT(graph.meta(out_eh).has()); + const uint32_t out_idx = graph.meta(out_eh).get().idx; + auto out_nh = out_eh->dstNode(); + all_data.emplace(out_nh, out_args[out_idx]); + } + } + } + + // NB: Collect "G" outputs + for (auto nh : graph.nodes()) { + if (graph.meta(nh).has()) { + graph_outputs.push_back(all_data.at(nh)); + } + } + + ASSERT(!graph_inputs.empty()) + ASSERT(!graph_outputs.empty()) + // NB: Finally save computation i/o to build GComputation later on + proto = Protocol{std::move(graph_inputs), std::move(graph_outputs)}; +} + +static void collectOutputMeta(Graph& graph, std::vector& out_meta) { + for (auto nh : graph.nodes()) { + if (graph.meta(nh).has()) { + out_meta.push_back(graph.meta(nh)); + } + } +} + +} // namespace passes + +ComputationBuilder::ComputationBuilder(IBuildStrategy::Ptr strategy): m_strategy(strategy) { +} + +Computation ComputationBuilder::build(ScenarioGraph& graph, const InferenceParamsMap& infer_params, + const ComputationBuilder::Options& opts) { + uint32_t max_parallel_branches = 1u; + auto compile_args = cv::compile_args(cv::gapi::kernels()); + std::vector outputs_meta; + Protocol proto; + + using namespace std::placeholders; + graph.pass(passes::throwIfCycle); + graph.pass(passes::init); + graph.pass(passes::fuseDelays); + graph.pass(std::bind(passes::findMaxParallelBranches, _1, std::ref(max_parallel_branches))); + graph.pass(std::bind(passes::buildOperations, _1, m_strategy, std::cref(infer_params))); + graph.pass(std::bind(passes::buildComputation, _1, std::ref(proto))); + graph.pass(std::bind(passes::collectOutputMeta, _1, std::ref(outputs_meta))); + + if (opts.add_perf_meta) { + // FIXME: Must work with any G-Type! + ASSERT(cv::util::holds_alternative(proto.graph_outputs.front())); + cv::GMat g = cv::util::get(proto.graph_outputs.front()); + proto.graph_outputs.emplace_back(cv::gapi::streaming::timestamp(g).strip()); + proto.graph_outputs.emplace_back(cv::gapi::streaming::seq_id(g).strip()); + } + + cv::GComputation comp(cv::GProtoInputArgs{std::move(proto.graph_inputs)}, + cv::GProtoOutputArgs{std::move(proto.graph_outputs)}); + + return Computation{std::move(comp), std::move(compile_args), std::move(outputs_meta), {max_parallel_branches}}; +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/computation_builder.hpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/computation_builder.hpp new file mode 100644 index 00000000000000..6a51b068065284 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/computation_builder.hpp @@ -0,0 +1,74 @@ +// +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "result.hpp" +#include "scenario/inference.hpp" +#include "scenario/scenario_graph.hpp" +#include "simulation/computation.hpp" +#include "utils/data_providers.hpp" + +#include +#include +#include + +struct InputIdx { + uint32_t idx; +}; + +struct OutputIdx { + uint32_t idx; +}; + +struct GraphInput {}; +struct GraphOutput {}; +struct GData {}; +struct GOperation { + using F = std::function; + F on; +}; + +struct Dump { + std::filesystem::path path; +}; + +struct Validate { + using F = std::function; + F validator; + std::vector reference; +}; + +struct InferDesc { + std::string tag; + LayersInfo input_layers; + LayersInfo output_layers; +}; + +struct IBuildStrategy { + using Ptr = std::shared_ptr; + struct InferBuildInfo { + std::vector providers; + std::vector inputs_meta; + std::vector 
outputs_meta; + const bool disable_copy; + }; + // NB: Extend for any further node types needed + virtual InferBuildInfo build(const InferDesc& infer) = 0; +}; + +class ComputationBuilder { +public: + explicit ComputationBuilder(IBuildStrategy::Ptr strategy); + + struct Options { + bool add_perf_meta; + }; + + Computation build(ScenarioGraph& graph, const InferenceParamsMap& infer_params, const Options& opts); + +private: + IBuildStrategy::Ptr m_strategy; +}; diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/dummy_source.cpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/dummy_source.cpp new file mode 100644 index 00000000000000..3b10767b34135f --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/dummy_source.cpp @@ -0,0 +1,89 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "dummy_source.hpp" + +#include + +#include "utils/utils.hpp" + +DummySource::DummySource(const uint64_t frames_interval_in_us, const bool drop_frames, + const bool disable_high_resolution_timer) + // NB: 0 is special value means no limit fps for source. + : m_latency_in_us(frames_interval_in_us), + m_drop_frames(drop_frames), + m_timer(SleepTimer::create(disable_high_resolution_timer)), + // NB: Used for simulation, just return 1 byte. + m_mat(utils::createRandom({1}, CV_8U)) { +} + +bool DummySource::pull(cv::gapi::wip::Data& data) { + using namespace std::chrono; + using namespace cv::gapi::streaming; + using ts_t = microseconds; + + // NB: Wait m_latency_in_us before return the first frame. + if (m_next_tick_ts == -1) { + m_next_tick_ts = utils::timestamp() + m_latency_in_us; + } + + int64_t curr_ts = utils::timestamp(); + if (curr_ts < m_next_tick_ts) { + /* + * curr_ts + * | + * ------|----*-----|-------> + * ^ + * m_next_tick_ts + * + * + * NB: New frame will be produced at the m_next_tick_ts point. + */ + m_timer->wait(ts_t{m_next_tick_ts - curr_ts}); + } else if (m_latency_in_us != 0) { + /* + * curr_ts + * +1 +2 | + * |----------|----------|----------|----*-----|-------> + * ^ ^ + * m_next_tick_ts -------------> + * + */ + + // NB: Count how many frames have been produced since last pull (m_next_tick_ts). + int64_t num_frames = static_cast((curr_ts - m_next_tick_ts) / m_latency_in_us); + // NB: Shift m_next_tick_ts to the nearest tick before curr_ts. + m_next_tick_ts += num_frames * m_latency_in_us; + // NB: if drop_frames is enabled, update current seq_id and wait for the next tick, otherwise + // return last written frame (+2 at the picture above) immediately. + if (m_drop_frames) { + // NB: Shift tick to the next frame. + m_next_tick_ts += m_latency_in_us; + // NB: Wait for the next frame. + m_timer->wait(ts_t{m_next_tick_ts - curr_ts}); + // NB: Drop already produced frames + update seq_id for the current. + m_curr_seq_id += num_frames + 1; + } + } + // NB: Just increase reference counter not to release mat memory + // after assigning it to the data. 
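+    // (cv::Mat copies are shallow: the buffer stays alive via reference counting)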
+ cv::Mat mat = m_mat; + + data.meta[meta_tag::timestamp] = utils::timestamp(); + data.meta[meta_tag::seq_id] = m_curr_seq_id++; + data = mat; + m_next_tick_ts += m_latency_in_us; + + return true; +} + +cv::GMetaArg DummySource::descr_of() const { + return cv::GMetaArg{cv::descr_of(m_mat)}; +} + +void DummySource::reset() { + m_next_tick_ts = -1; + m_curr_seq_id = 0; +}; diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/dummy_source.hpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/dummy_source.hpp new file mode 100644 index 00000000000000..304e4e7ef2f512 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/dummy_source.hpp @@ -0,0 +1,37 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include +#include // cv::gapi::wip::IStreamSource + +#include "utils/timer.hpp" +#include "utils/utils.hpp" + +class DummySource final : public cv::gapi::wip::IStreamSource { +public: + using Ptr = std::shared_ptr; + + explicit DummySource(const uint64_t frames_interval_in_us, const bool drop_frames, + const bool disable_high_resolution_timer); + + bool pull(cv::gapi::wip::Data& data) override; + cv::GMetaArg descr_of() const override; + void reset(); + +private: + uint64_t m_latency_in_us; + bool m_drop_frames; + IWaitable::Ptr m_timer; + + cv::Mat m_mat; + int64_t m_next_tick_ts = -1; + int64_t m_curr_seq_id = 0; +}; diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/executor.cpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/executor.cpp new file mode 100644 index 00000000000000..4a0fa451dace91 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/executor.cpp @@ -0,0 +1,66 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "executor.hpp" +#include "utils/error.hpp" + +#include + +PipelinedExecutor::PipelinedExecutor(cv::GStreamingCompiled&& compiled): m_compiled(std::move(compiled)) { +} + +PipelinedExecutor::Output PipelinedExecutor::runLoop(cv::GRunArgs&& inputs, Callback callback, + ITermCriterion::Ptr criterion) { + if (!criterion) { + THROW_ERROR("Termination criterion hasn't been specified!"); + } + + using namespace std::chrono; + using clock_t = high_resolution_clock; + + m_compiled.setSource(std::move(inputs)); + criterion->init(); + + const auto start_tick = clock_t::now(); + m_compiled.start(); + while (criterion->check()) { + if (!callback(m_compiled)) { + break; + } + criterion->update(); + } + const auto end_tick = clock_t::now(); + // NB: Some frames might be in queue just wait until they processed. + // They shouldn't be taken into account since execution is over. 
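+    // NB: end_tick is captured before stop(), so frames drained here are
+    // excluded from the reported elapsed time.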
+ m_compiled.stop(); + return Output{static_cast(duration_cast(end_tick - start_tick).count())}; +} + +SyncExecutor::SyncExecutor(cv::GCompiled&& compiled): m_compiled(std::move(compiled)) { +} + +SyncExecutor::Output SyncExecutor::runLoop(Callback callback, ITermCriterion::Ptr criterion) { + if (!criterion) { + THROW_ERROR("Termination criterion hasn't been specified!"); + } + + using namespace std::chrono; + using clock_t = high_resolution_clock; + + const auto start_tick = clock_t::now(); + criterion->init(); + while (criterion->check()) { + if (!callback(m_compiled)) { + break; + } + criterion->update(); + } + const auto end_tick = clock_t::now(); + return Output{static_cast(duration_cast(end_tick - start_tick).count())}; +} + +void SyncExecutor::reset() { + m_compiled.prepareForNewStream(); +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/executor.hpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/executor.hpp new file mode 100644 index 00000000000000..17d32937b8ba54 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/executor.hpp @@ -0,0 +1,42 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include // cv::GCompiled +#include // cv::GStreamingCompiled + +#include "scenario/criterion.hpp" + +class PipelinedExecutor { +public: + explicit PipelinedExecutor(cv::GStreamingCompiled&& compiled); + + struct Output { + uint64_t elapsed_us; + }; + using Callback = std::function; + + Output runLoop(cv::GRunArgs&& inputs, Callback callback, ITermCriterion::Ptr criterion); + +private: + cv::GStreamingCompiled m_compiled; +}; + +class SyncExecutor { +public: + explicit SyncExecutor(cv::GCompiled&& compiled); + + struct Output { + uint64_t elapsed_us; + }; + using Callback = std::function; + + Output runLoop(Callback callback, ITermCriterion::Ptr criterion); + void reset(); + +private: + cv::GCompiled m_compiled; +}; diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/layers_data.cpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/layers_data.cpp new file mode 100644 index 00000000000000..f3b621c68e8f99 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/layers_data.cpp @@ -0,0 +1,155 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "layers_data.hpp" + +#include +#include + +#include "utils/error.hpp" +#include "utils/logger.hpp" +#include "utils/utils.hpp" + +std::string normalizeLayerName(const std::string& layer_name) { + std::string normalized = layer_name; + std::unordered_set prohibited = {'\\', '/', ':', '*', '?', '"', '<', '>'}; + std::replace_if( + normalized.begin(), normalized.end(), + [&prohibited](char ch) { + return prohibited.find(ch) != prohibited.end(); + }, + '_'); + return normalized; +}; + +std::vector uploadLayerData(const std::filesystem::path& path, const std::string& tag, + const LayerInfo& layer) { + if (!std::filesystem::exists(path) || !std::filesystem::is_directory(path)) { + THROW_ERROR("Failed to find data folder: " << path << " for model: " << tag << ", layer: " << layer.name); + } + std::string iter_file_pattern = "iter_(\\d+)\\.bin"; + std::regex regex(iter_file_pattern); + std::unordered_map iter_files_map; + for (const auto& entry : std::filesystem::directory_iterator{path}) { + std::smatch match; + const auto& filename = entry.path().filename().string(); + if (std::regex_match(filename, match, regex)) { + const auto iter_idx = 
std::stoi(match[1].str()); + iter_files_map.emplace(iter_idx, entry); + } + } + std::vector out_mats; + for (int i = 0; i < iter_files_map.size(); ++i) { + if (auto it = iter_files_map.find(i); it != iter_files_map.end()) { + cv::Mat mat; + utils::createNDMat(mat, layer.dims, layer.prec); + utils::readFromBinFile(it->second.string(), mat); + out_mats.push_back(std::move(mat)); + } else { + THROW_ERROR("Failed to find data for iteration: " << i << ", model: " << tag << ", layer: " << layer.name); + } + } + return out_mats; +} + +using LayersDataMap = std::unordered_map>; +LayersDataMap uploadFromDirectory(const std::filesystem::path& path, const std::string& tag, const LayersInfo& layers) { + LayersDataMap layers_data; + for (const auto& layer : layers) { + auto normalized = normalizeLayerName(layer.name); + auto data = uploadLayerData(path / normalized, tag, layer); + if (data.empty()) { + THROW_ERROR("No iterations data found for model: " << tag << ", layer: " << layer.name); + } + LOG_INFO() << " - Found " << data.size() << " iteration(s) for layer: " << layer.name << std::endl; + layers_data.emplace(layer.name, std::move(data)); + } + return layers_data; +} + +LayersDataMap uploadData(const std::filesystem::path& path, const std::string& tag, const LayersInfo& layers, + LayersType type) { + ASSERT(!layers.empty()); + const std::string kLayersTypeStr = type == LayersType::INPUT ? "input" : "output"; + if (!std::filesystem::exists(path)) { + THROW_ERROR("" << path << " must exist to upload layers data!") + } + LayersDataMap layers_data; + if (std::filesystem::is_directory(path)) { + layers_data = uploadFromDirectory(path, tag, layers); + } else { + if (layers.size() > 1u) { + THROW_ERROR("Model: " << tag << " must have exactly one " << kLayersTypeStr + << " layer in order to upload data from: " << path); + } + const auto& layer = layers.front(); + cv::Mat mat; + utils::createNDMat(mat, layer.dims, layer.prec); + utils::readFromBinFile(path.string(), mat); + LOG_INFO() << " - Found single iteration data for model: " << tag << ", layer: " << layer.name << std::endl; + layers_data = {{layer.name, std::vector{mat}}}; + } + // NB: layers_data can't be empty as long as layers vector is non-empty. + const auto kNumPerLayerIterations = layers_data.begin()->second.size(); + // NB: All i/o layers for model must have the equal amount of data. 
+ for (const auto& [layer_name, data_vec] : layers_data) { + if (data_vec.size() != kNumPerLayerIterations) { + THROW_ERROR("Model: " << tag << " has different amount of data for " << kLayersTypeStr + << " layer: " << layer_name << "(" << data_vec.size() << ") and layer: " + << layers_data.begin()->first << "(" << kNumPerLayerIterations << ")"); + } + } + return layers_data; +} + +bool isDirectory(const std::filesystem::path& path) { + if (std::filesystem::exists(path)) { + return std::filesystem::is_directory(path); + } + return path.extension().empty(); +} + +std::vector createConstantProviders(LayersDataMap&& layers_data, + const std::vector& layer_names) { + std::vector providers; + for (const auto& layer_name : layer_names) { + auto layer_data = layers_data.at(layer_name); + providers.push_back(std::make_shared(std::move(layer_data))); + } + return providers; +} + +std::vector createRandomProviders(const LayersInfo& layers, + const std::map& generators) { + std::vector providers; + for (const auto& layer : layers) { + auto generator = generators.at(layer.name); + auto provider = std::make_shared(generator, layer.dims, layer.prec); + LOG_INFO() << " - Random generator: " << generator->str() << " will be used for layer: " << layer.name + << std::endl; + providers.push_back(std::move(provider)); + } + return providers; +} + +std::vector createDirectoryLayout(const std::filesystem::path& path, + const std::vector& layer_names) { + std::vector dirs_path; + std::filesystem::create_directories(path); + for (const auto& layer_name : layer_names) { + // NB: Use normalized layer name to create dir + // to store reference data for particular layer. + std::filesystem::path curr_dir = path / normalizeLayerName(layer_name); + dirs_path.push_back(curr_dir); + std::filesystem::create_directory(curr_dir); + { + // NB: Save the original layer name; + std::ofstream file{curr_dir / "layer_name.txt"}; + ASSERT(file.is_open()); + file << layer_name; + } + } + return dirs_path; +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/layers_data.hpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/layers_data.hpp new file mode 100644 index 00000000000000..6d2b9bc6716212 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/layers_data.hpp @@ -0,0 +1,57 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "scenario/inference.hpp" +#include "utils/data_providers.hpp" + +std::string normalizeLayerName(const std::string& layer_name); +std::vector uploadLayerData(const std::filesystem::path& path, const std::string& tag, const LayerInfo& layer); + +enum class LayersType { INPUT = 0, OUTPUT }; +using LayersDataMap = std::unordered_map>; +LayersDataMap uploadFromDirectory(const std::filesystem::path& path, const std::string& tag, const LayersInfo& layers); + +LayersDataMap uploadData(const std::filesystem::path& path, const std::string& tag, const LayersInfo& layers, + LayersType type); + +bool isDirectory(const std::filesystem::path& path); + +std::vector createConstantProviders(LayersDataMap&& layers_data, + const std::vector& layer_names); + +std::vector createRandomProviders(const LayersInfo& layers, + const std::map& generators); + +std::vector createDirectoryLayout(const std::filesystem::path& path, + const std::vector& layer_names); +template +std::map unpackWithDefault(const LayerVariantAttr& attr, const std::vector& layer_names, + const T& def_value) { + std::map result; + if 
(std::holds_alternative(attr)) { + for (const auto& layer_name : layer_names) { + result.emplace(layer_name, def_value); + } + } else if (std::holds_alternative(attr)) { + auto val = std::get(attr); + for (const auto& layer_name : layer_names) { + result.emplace(layer_name, val); + } + } else { + auto map = std::get>(attr); + for (const auto& layer_name : layer_names) { + if (auto it = map.find(layer_name); it != map.end()) { + result.emplace(layer_name, it->second); + } else { + result.emplace(layer_name, def_value); + } + } + } + return result; +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/layers_reader.cpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/layers_reader.cpp new file mode 100644 index 00000000000000..72c1e9539773e3 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/layers_reader.cpp @@ -0,0 +1,46 @@ +// +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "simulation/layers_reader.hpp" +#include "scenario/inference.hpp" +#include "utils/error.hpp" +#include "utils/logger.hpp" + +OpenVINOLayersReader& getOVReader() { + static OpenVINOLayersReader reader; + return reader; +} + +static std::string getModelFileName(const InferenceParams& params) { + if (std::holds_alternative(params)) { + const auto& ov_params = std::get(params); + if (std::holds_alternative(ov_params.path)) { + return std::get(ov_params.path).model; + } else { + ASSERT(std::holds_alternative(ov_params.path)); + return std::get(ov_params.path).blob; + } + } else if (std::holds_alternative(params)) { + return std::get(params).model_path; + } else { + THROW_ERROR("Unsupported model parameters type!"); + } + // NB: Unreachable + ASSERT(false); +} + +InOutLayers LayersReader::readLayers(const InferenceParams& params) { + LOG_INFO() << "Reading model " << getModelFileName(params) << std::endl; + if (std::holds_alternative(params)) { + const auto& ov = std::get(params); + return getOVReader().readLayers(ov); + } + ASSERT(std::holds_alternative(params)); + const auto& ort = std::get(params); + // NB: Using OpenVINO to read the i/o layers information for *.onnx model + OpenVINOParams ov; + ov.path = OpenVINOParams::ModelPath{ort.model_path, ""}; + return getOVReader().readLayers(ov, true /* use_results_names */); +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/layers_reader.hpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/layers_reader.hpp new file mode 100644 index 00000000000000..1d701272255fb0 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/layers_reader.hpp @@ -0,0 +1,27 @@ +// +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "scenario/inference.hpp" + +#include + +struct InOutLayers { + LayersInfo in_layers; + LayersInfo out_layers; +}; + +class OpenVINOLayersReader { +public: + OpenVINOLayersReader(); + InOutLayers readLayers(const OpenVINOParams& params, const bool use_results_names = false); + +private: + class Impl; + std::shared_ptr m_impl; +}; + +namespace LayersReader { +InOutLayers readLayers(const InferenceParams& params); +} // namespace LayersReader diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/operations.cpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/operations.cpp new file mode 100644 index 00000000000000..1b353dbf6e7288 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/operations.cpp @@ -0,0 +1,131 @@ +// +// Copyright (C) 2023-2024 Intel Corporation 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include "simulation/operations.hpp" +#include "utils/error.hpp" + +cv::GProtoArgs InferCall::operator()(const cv::GProtoArgs& inputs) { + cv::GInferInputs infer_inputs; + for (int i = 0; i < inputs.size(); ++i) { + auto gmat = cv::util::get(inputs[i]); + infer_inputs[input_names[i]] = gmat; + } + auto infer_outputs = cv::gapi::infer(tag, infer_inputs); + cv::GProtoArgs outputs; + for (int i = 0; i < output_names.size(); ++i) { + outputs.emplace_back(infer_outputs.at(output_names[i])); + } + return outputs; +} + +std::vector GDummyM::on(const std::vector& ins, const uint64_t delay_in_us, + const std::vector& providers, const bool disable_copy) { + std::vector shapes; + std::vector op_kinds; + std::vector host_ctors; + std::vector gargs; + std::vector out_kinds; + + gargs.emplace_back(providers); + gargs.emplace_back(delay_in_us); + gargs.emplace_back(disable_copy); + + for (int i = 0; i < ins.size(); ++i) { + auto shape = cv::detail::GTypeTraits::shape; + shapes.push_back(shape); + auto op_kind = cv::detail::GTypeTraits::op_kind; + op_kinds.push_back(op_kind); + host_ctors.push_back(cv::detail::GObtainCtor::get()); + gargs.emplace_back(ins[i]); + } + + const size_t num_outputs = providers.size(); + for (int i = 0; i < num_outputs; ++i) { + auto op_kind = cv::detail::GTypeTraits::op_kind; + out_kinds.push_back(op_kind); + } + + using namespace std::placeholders; + cv::GKernel k{GDummyM::id(), + "", + std::bind(&GDummyM::getOutMeta, _1, _2), + std::move(shapes), + std::move(op_kinds), + std::move(host_ctors), + std::move(out_kinds)}; + + cv::GCall call(std::move(k)); + call.setArgs(std::move(gargs)); + + std::vector outs; + outs.reserve(num_outputs); + for (int i = 0; i < num_outputs; ++i) { + outs.push_back(call.yield(i)); + } + + return outs; +} + +cv::GMetaArgs GDummyM::getOutMeta(const cv::GMetaArgs&, const cv::GArgs& args) { + const auto& providers = args.front().get>(); + cv::GMetaArgs out_metas; + out_metas.reserve(providers.size()); + for (auto provider : providers) { + out_metas.emplace_back(provider->desc()); + } + return out_metas; +} + +cv::gapi::GBackend GCPUDummyM::backend() { + return cv::gapi::cpu::backend(); +} + +cv::GCPUKernel GCPUDummyM::kernel() { + return cv::GCPUKernel(&GCPUDummyM::call, &GCPUDummyM::setup); +} + +void GCPUDummyM::setup(const cv::GMetaArgs& metas, cv::GArgs gargs, cv::GArg& state, const cv::GCompileArgs& args) { + state = cv::GArg(std::make_shared()); + auto providers = gargs.front().get>(); + for (auto& provider : providers) { + provider->reset(); + } +} + +void GCPUDummyM::call(cv::GCPUContext& ctx) { + using namespace std::chrono; + const bool disable_copy = ctx.inArg(2u); + uint64_t elapsed = disable_copy ? 
0u : utils::measure([&]() { + auto& providers = ctx.inArg>(0u); + for (size_t i = 0; i < providers.size(); ++i) { + providers[i]->pull(ctx.outMatR(static_cast(i))); + } + }); + const auto delay_in_us = ctx.inArg(1u); + utils::busyWait(microseconds{std::max(delay_in_us - elapsed, uint64_t{0})}); +} + +cv::GProtoArgs DummyCall::operator()(const cv::GProtoArgs& inputs) { + std::vector gmats; + gmats.reserve(inputs.size()); + for (auto& in : inputs) { + gmats.emplace_back(cv::util::get(in)); + } + auto outputs = GDummyM::on(gmats, delay_in_us, providers, disable_copy); + cv::GProtoArgs proto_outputs; + for (auto& out : outputs) { + proto_outputs.emplace_back(cv::GProtoArg{out}); + } + return proto_outputs; +} + +cv::GProtoArgs CompoundCall::operator()(const cv::GProtoArgs& inputs) { + ASSERT(inputs.size() == 1) + cv::GMat in = cv::util::get(inputs[0]); + + cv::GProtoArgs proto_outputs; + proto_outputs.emplace_back(GCompound::on(in, function)); + return proto_outputs; +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/operations.hpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/operations.hpp new file mode 100644 index 00000000000000..cce38c9d83d07f --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/operations.hpp @@ -0,0 +1,77 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include // GAPI_OCV_KERNEL +#include // G_API_OP +#include + +#include "utils/data_providers.hpp" +#include "utils/utils.hpp" + +// clang-format off +struct InferCall { + cv::GProtoArgs operator()(const cv::GProtoArgs& inputs); + + std::string tag; + std::vector input_names; + std::vector output_names; +}; + +struct DummyState { }; +struct GDummyM { + static const char *id() { return "custom.dummym"; } + static std::vector on(const std::vector &ins, + const uint64_t delay_in_us, + const std::vector &providers, + const bool disable_copy); + static cv::GMetaArgs getOutMeta(const cv::GMetaArgs&, const cv::GArgs &args); +}; + +struct GCPUDummyM: public cv::detail::KernelTag { + using API = GDummyM; + using State = DummyState; + + static cv::gapi::GBackend backend(); + static cv::GCPUKernel kernel(); + static void setup(const cv::GMetaArgs &metas, + cv::GArgs gargs, + cv::GArg &state, + const cv::GCompileArgs &args); + static void call(cv::GCPUContext &ctx); +}; + +struct DummyCall { + std::vector providers; + uint64_t delay_in_us; + // NB: Don't pull data from providers if enabled + bool disable_copy = false; + cv::GProtoArgs operator()(const cv::GProtoArgs& inputs); +}; + +using F = std::function; + +G_TYPED_KERNEL(GCompound, , "custom.compound") +{ + static cv::GMatDesc outMeta(cv::GMatDesc in, F){ + return in; + } +}; + +GAPI_OCV_KERNEL(GCPUCompound, GCompound) +{ + static void run(const cv::Mat& in, + F function, + cv::Mat& out) + { + function(); + } +}; + +struct CompoundCall { + cv::GProtoArgs operator()(const cv::GProtoArgs& inputs); + F function; +}; diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/ov_layers_reader.cpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/ov_layers_reader.cpp new file mode 100644 index 00000000000000..57527cef0cc4aa --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/ov_layers_reader.cpp @@ -0,0 +1,215 @@ +// +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "simulation/layers_reader.hpp" + +#include // CV_* +#include + +#include "utils/error.hpp" + +#include + +class 
OpenVINOLayersReader::Impl { +public: + InOutLayers readLayers(const OpenVINOParams& params, const bool use_results_names); + +private: + InOutLayers readFromBlob(const std::string& blob, const std::string& device, + const std::map& config); + + InOutLayers readFromModel(const std::string& xml, const std::string& bin, const OpenVINOParams& params, + const bool use_results_names); + +private: + ov::Core m_core; +}; + +OpenVINOLayersReader::OpenVINOLayersReader(): m_impl(new OpenVINOLayersReader::Impl{}) { +} + +static ov::element::Type toElementType(int cvdepth) { + switch (cvdepth) { + case CV_8U: + return ov::element::u8; + case CV_32S: + return ov::element::i32; + case CV_32F: + return ov::element::f32; + case CV_16F: + return ov::element::f16; + } + throw std::logic_error("Failed to convert opencv depth to ov::element::Type"); +} + +static std::vector toDims(const std::vector& sz_vec) { + std::vector result; + result.reserve(sz_vec.size()); + for (auto sz : sz_vec) { + // FIXME: Probably requires some check... + result.push_back(static_cast(sz)); + } + return result; +} + +static int toPrecision(ov::element::Type prec) { + switch (prec) { + case ov::element::u8: + return CV_8U; + case ov::element::i32: + return CV_32S; + case ov::element::f32: + return CV_32F; + case ov::element::f16: + return CV_16F; + case ov::element::i64: + return CV_32S; + } + throw std::logic_error("Unsupported OV precision"); +} + +template +std::vector ovToLayersInfo(const InfoVec& vec) { + std::vector layers; + layers.reserve(vec.size()); + std::transform(vec.begin(), vec.end(), std::back_inserter(layers), [](const auto& node) { + return LayerInfo{node.get_any_name(), toDims(node.get_shape()), toPrecision(node.get_element_type())}; + }); + return layers; +}; + +static void cfgInputPreproc(ov::preprocess::PrePostProcessor& ppp, const std::shared_ptr& model, + const AttrMap& input_precision, const AttrMap& input_layout, + const AttrMap& input_model_layout) { + for (const auto& input : model->inputs()) { + const auto& name = input.get_any_name(); + auto& ii = ppp.input(name); + + const auto ip = lookUp(input_precision, name); + if (ip.has_value()) { + ii.tensor().set_element_type(toElementType(*ip)); + } + + const auto il = lookUp(input_layout, name); + if (il.has_value()) { + ii.tensor().set_layout(ov::Layout(*il)); + } + + const auto iml = lookUp(input_model_layout, name); + if (iml.has_value()) { + ii.model().set_layout(ov::Layout(*iml)); + } + } +} + +static void cfgOutputPostproc(ov::preprocess::PrePostProcessor& ppp, const std::shared_ptr& model, + const AttrMap& output_precision, const AttrMap& output_layout, + const AttrMap output_model_layout) { + for (const auto& output : model->outputs()) { + const auto& name = output.get_any_name(); + auto& oi = ppp.output(name); + + const auto op = lookUp(output_precision, name); + if (op.has_value()) { + oi.tensor().set_element_type(toElementType(*op)); + } + + const auto ol = lookUp(output_layout, name); + if (ol.has_value()) { + oi.tensor().set_layout(ov::Layout(*ol)); + } + + const auto oml = lookUp(output_model_layout, name); + if (oml.has_value()) { + oi.model().set_layout(ov::Layout(*oml)); + } + } +} + +static std::vector extractLayerNames(const std::vector>& nodes) { + std::vector names; + std::transform(nodes.begin(), nodes.end(), std::back_inserter(names), [](const auto& node) { + return node.get_any_name(); + }); + return names; +} + +InOutLayers OpenVINOLayersReader::Impl::readFromModel(const std::string& model_path, const std::string& bin_path, + const 
OpenVINOParams& params, const bool use_results_names) { + auto model = m_core.read_model(model_path, bin_path); + { + ov::preprocess::PrePostProcessor ppp(model); + + const auto& input_names = extractLayerNames(model->inputs()); + const auto ip_map = unpackLayerAttr(params.input_precision, input_names, "input precision"); + const auto il_map = unpackLayerAttr(params.input_layout, input_names, "input layout"); + const auto iml_map = unpackLayerAttr(params.input_model_layout, input_names, "input model layout"); + cfgInputPreproc(ppp, model, ip_map, il_map, iml_map); + + const auto& output_names = extractLayerNames(model->outputs()); + const auto op_map = unpackLayerAttr(params.output_precision, output_names, "output precision"); + const auto ol_map = unpackLayerAttr(params.output_layout, output_names, "output layout"); + const auto oml_map = unpackLayerAttr(params.output_model_layout, output_names, "output model layout"); + cfgOutputPostproc(ppp, model, op_map, ol_map, oml_map); + + model = ppp.build(); + } + + auto input_layers = ovToLayersInfo(model->inputs()); + auto output_layers = ovToLayersInfo(model->outputs()); + + // FIXME: UGLY WA in order to use layer names obtained by OV reader in ONNXRT. + // Ideally there should be corresponding ONNXRT reader instead!!! + // Result nodes friendly names preserve the names from original model, + // so the could be used in different framework (not only OpenVINO) + if (use_results_names) { + const auto& results = model->get_results(); + for (int i = 0; i < results.size(); ++i) { + auto result_name = results[i]->get_friendly_name(); + // This suffix is hardcoded at the OpenVINO side + const std::string suffix = "/sink_port_0"; + const auto kSuffixStartPos = result_name.length() - suffix.length(); + // Check that suffix is still presented at the OpenVINO side + ASSERT(result_name.substr(kSuffixStartPos) == suffix); + // Drop the suffix as it's not needed and update the name + result_name = result_name.substr(0, kSuffixStartPos); + output_layers[i].name = result_name; + } + } + + return {std::move(input_layers), std::move(output_layers)}; +} + +InOutLayers OpenVINOLayersReader::Impl::readFromBlob(const std::string& blob, const std::string& device, + const std::map& config) { + std::ifstream file(blob, std::ios_base::in | std::ios_base::binary); + if (!file.is_open()) { + THROW_ERROR("Failed to import model from: " << blob); + } + + auto compiled_model = m_core.import_model(file, device, {config.begin(), config.end()}); + + auto input_layers = ovToLayersInfo(compiled_model.inputs()); + auto output_layers = ovToLayersInfo(compiled_model.outputs()); + + return {std::move(input_layers), std::move(output_layers)}; +} + +InOutLayers OpenVINOLayersReader::Impl::readLayers(const OpenVINOParams& params, const bool use_results_names) { + if (std::holds_alternative(params.path)) { + const auto& path = std::get(params.path); + return readFromModel(path.model, path.bin, params, use_results_names); + } + ASSERT(std::holds_alternative(params.path)); + // NB: use_results_names is WA for reading layer names for the further usage in ONNXRT + // since ONNXRT is always ModelPath case (*.onnx format), no need to handle this for *.blob's + ASSERT(!use_results_names); + const auto& path = std::get(params.path); + return readFromBlob(path.blob, params.device, params.config); +} + +InOutLayers OpenVINOLayersReader::readLayers(const OpenVINOParams& params, const bool use_results_names) { + return m_impl->readLayers(params, use_results_names); +} diff --git 
a/src/plugins/intel_npu/tools/protopipe/src/simulation/performance_mode.cpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/performance_mode.cpp new file mode 100644 index 00000000000000..4e47b34e3d2d35 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/performance_mode.cpp @@ -0,0 +1,337 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "performance_mode.hpp" + +#include "simulation/computation_builder.hpp" +#include "simulation/executor.hpp" +#include "simulation/layers_data.hpp" +#include "utils/logger.hpp" +#include "utils/utils.hpp" + +#include // cv::GCompileArgs +#include // ov::benchmark_mode{} + +#include + +class PerformanceMetrics { +public: + PerformanceMetrics(const uint64_t elapsed, const std::vector latency, const std::vector seq_ids); + friend std::ostream& operator<<(std::ostream& os, const PerformanceMetrics& metrics); + +private: + // TODO: avg, min, max statistics can be encapsulated. + double avg_latency_ms; + double min_latency_ms; + double max_latency_ms; + int64_t total_frames; + double fps; + int64_t dropped; +}; + +PerformanceMetrics::PerformanceMetrics(const uint64_t elapsed_us, const std::vector latency_us, + const std::vector seq_ids) { + avg_latency_ms = utils::avg(latency_us) / 1000.0; + min_latency_ms = utils::min(latency_us) / 1000.0; + max_latency_ms = utils::max(latency_us) / 1000.0; + double elapsed_ms = static_cast(elapsed_us / 1000.0); + fps = latency_us.size() / elapsed_ms * 1000; + + dropped = 0; + int64_t prev_seq_id = seq_ids[0]; + for (size_t i = 1; i < seq_ids.size(); ++i) { + dropped += seq_ids[i] - prev_seq_id - 1; + prev_seq_id = seq_ids[i]; + } + total_frames = seq_ids.back() + 1; +} + +std::ostream& operator<<(std::ostream& os, const PerformanceMetrics& metrics) { + os << "throughput: " << metrics.fps << " FPS, latency: min: " << metrics.min_latency_ms + << " ms, avg: " << metrics.avg_latency_ms << " ms, max: " << metrics.max_latency_ms + << " ms, frames dropped: " << metrics.dropped << "/" << metrics.total_frames; + return os; +} + +namespace { + +struct InputDataVisitor { + InputDataVisitor(const InferDesc& _infer, const PerformanceSimulation::Options& _opts) + : infer(_infer), opts(_opts), providers(infer.input_layers.size()) { + } + + void operator()(std::monostate); + void operator()(const std::string&); + void operator()(const LayerVariantAttr&); + + const InferDesc& infer; + const PerformanceSimulation::Options& opts; + std::vector providers; +}; + +void InputDataVisitor::operator()(std::monostate) { + LOG_INFO() << "Input data path for model: " << infer.tag << " hasn't been provided. Will be generated randomly" + << std::endl; + auto initializers = opts.initializers_map.at(infer.tag); + auto default_initialzer = + opts.global_initializer ? 
opts.global_initializer : std::make_shared(0.0, 255.0); + auto per_layer_initializers = + unpackWithDefault(initializers, extractLayerNames(infer.input_layers), default_initialzer); + providers = createRandomProviders(infer.input_layers, per_layer_initializers); +}; + +void InputDataVisitor::operator()(const std::string& path_str) { + const std::filesystem::path path{path_str}; + if (std::filesystem::exists(path)) { + LOG_INFO() << "Input data path: " << path << " for model: " << infer.tag << " exists - data will be uploaded" + << std::endl; + auto layers_data = uploadData(path, infer.tag, infer.input_layers, LayersType::INPUT); + providers = createConstantProviders(std::move(layers_data), extractLayerNames(infer.input_layers)); + } else { + auto initializers = opts.initializers_map.at(infer.tag); + auto default_initialzer = + opts.global_initializer ? opts.global_initializer : std::make_shared(0.0, 255.0); + auto per_layer_initializers = + unpackWithDefault(initializers, extractLayerNames(infer.input_layers), default_initialzer); + LOG_INFO() << "Input data path: " << path << " for model: " << infer.tag + << " provided but doesn't exist - will be generated randomly" << std::endl; + providers = createRandomProviders(infer.input_layers, per_layer_initializers); + } +} + +void InputDataVisitor::operator()(const LayerVariantAttr&) { + THROW_ERROR("Performance mode supports input data in form of either directory or single file!"); +}; + +} // anonymous namespace + +PerformanceStrategy::PerformanceStrategy(const PerformanceSimulation::Options& _opts): opts(_opts){}; + +IBuildStrategy::InferBuildInfo PerformanceStrategy::build(const InferDesc& infer) { + const auto& input_data = opts.input_data_map.at(infer.tag); + InputDataVisitor in_data_visitor{infer, opts}; + std::visit(in_data_visitor, input_data); + // NB: No special I/O meta for this mode + std::vector inputs_meta(infer.input_layers.size(), Meta{}); + std::vector outputs_meta(infer.output_layers.size(), Meta{}); + return {std::move(in_data_visitor.providers), std::move(inputs_meta), std::move(outputs_meta), opts.inference_only}; +} + +namespace { + +class SyncSimulation : public SyncCompiled { +public: + struct Options { + uint32_t after_iter_delay_in_us = 0u; + }; + + SyncSimulation(cv::GCompiled&& compiled, std::vector&& sources, const size_t num_outputs, + const Options& options); + + Result run(ITermCriterion::Ptr criterion) override; + +private: + void reset(); + bool process(cv::GCompiled& pipeline); + + SyncExecutor m_exec; + std::vector m_sources; + std::vector m_out_mats; + int64_t m_ts, m_seq_id; + + std::vector m_per_iter_latency; + std::vector m_per_iter_seq_ids; + + Options m_opts; +}; + +class PipelinedSimulation : public PipelinedCompiled { +public: + PipelinedSimulation(cv::GStreamingCompiled&& compiled, std::vector&& sources, + const size_t num_outputs); + + Result run(ITermCriterion::Ptr criterion) override; + +private: + bool process(cv::GStreamingCompiled& pipeline); + + PipelinedExecutor m_exec; + std::vector m_sources; + cv::optional m_ts, m_seq_id; + std::vector> m_opt_mats; + + std::vector m_per_iter_latency; + std::vector m_per_iter_seq_ids; +}; + +//////////////////////////////// SyncSimulation /////////////////////////////// +SyncSimulation::SyncSimulation(cv::GCompiled&& compiled, std::vector&& sources, + const size_t num_outputs, const SyncSimulation::Options& options) + : m_exec(std::move(compiled)), + m_sources(std::move(sources)), + m_out_mats(num_outputs), + m_ts(-1), + m_seq_id(-1), + m_opts(options) { + 
LOG_DEBUG() << "Run warm-up iteration" << std::endl; + this->run(std::make_shared(1u)); + LOG_DEBUG() << "Warm-up has finished successfully." << std::endl; +} + +void SyncSimulation::reset() { + for (auto src : m_sources) { + src->reset(); + } + m_exec.reset(); +}; + +Result SyncSimulation::run(ITermCriterion::Ptr criterion) { + using namespace std::placeholders; + auto cb = std::bind(&SyncSimulation::process, this, _1); + auto out = m_exec.runLoop(cb, criterion); + PerformanceMetrics metrics(out.elapsed_us, m_per_iter_latency, m_per_iter_seq_ids); + m_per_iter_latency.clear(); + m_per_iter_seq_ids.clear(); + std::stringstream ss; + ss << metrics; + this->reset(); + return Success{ss.str()}; +}; + +bool SyncSimulation::process(cv::GCompiled& pipeline) { + using ts_t = std::chrono::microseconds; + auto pipeline_outputs = cv::gout(); + // NB: Reference is mandatory there since copying empty + // Mat may lead to weird side effects. + for (auto& out_mat : m_out_mats) { + pipeline_outputs += cv::gout(out_mat); + } + pipeline_outputs += cv::gout(m_ts); + pipeline_outputs += cv::gout(m_seq_id); + + cv::GRunArgs pipeline_inputs; + pipeline_inputs.reserve(m_sources.size()); + for (auto src : m_sources) { + cv::gapi::wip::Data data; + src->pull(data); + pipeline_inputs.push_back(std::move(data)); + } + pipeline(std::move(pipeline_inputs), std::move(pipeline_outputs)); + const auto curr_ts = utils::timestamp(); + m_per_iter_latency.push_back(curr_ts - m_ts); + m_per_iter_seq_ids.push_back(m_seq_id); + + // NB: Do extra busy wait to simulate the user's post processing after stream. + if (m_opts.after_iter_delay_in_us != 0) { + utils::busyWait(std::chrono::microseconds{m_opts.after_iter_delay_in_us}); + } + return true; +} + +//////////////////////////////// PipelinedSimulation /////////////////////////////// +PipelinedSimulation::PipelinedSimulation(cv::GStreamingCompiled&& compiled, std::vector&& sources, + const size_t num_outputs) + : m_exec(std::move(compiled)), m_sources(std::move(sources)), m_opt_mats(num_outputs) { + LOG_DEBUG() << "Run warm-up iteration" << std::endl; + this->run(std::make_shared(1u)); + LOG_DEBUG() << "Warm-up has finished successfully." << std::endl; +} + +Result PipelinedSimulation::run(ITermCriterion::Ptr criterion) { + auto pipeline_inputs = cv::gin(); + for (auto source : m_sources) { + pipeline_inputs += cv::gin(static_cast(source)); + } + + using namespace std::placeholders; + auto cb = std::bind(&PipelinedSimulation::process, this, _1); + auto out = m_exec.runLoop(std::move(pipeline_inputs), cb, criterion); + PerformanceMetrics metrics(out.elapsed_us, m_per_iter_latency, m_per_iter_seq_ids); + m_per_iter_latency.clear(); + m_per_iter_seq_ids.clear(); + + std::stringstream ss; + ss << metrics; + + // NB: Reset sources since they may have their state changed. 
+ for (auto src : m_sources) { + src->reset(); + } + return Success{ss.str()}; +}; + +bool PipelinedSimulation::process(cv::GStreamingCompiled& pipeline) { + using ts_t = std::chrono::microseconds; + cv::GOptRunArgsP pipeline_outputs; + for (auto& opt_mat : m_opt_mats) { + pipeline_outputs.emplace_back(cv::gout(opt_mat)[0]); + } + pipeline_outputs.emplace_back(cv::gout(m_ts)[0]); + pipeline_outputs.emplace_back(cv::gout(m_seq_id)[0]); + const bool has_data = pipeline.pull(std::move(pipeline_outputs)); + const auto curr_ts = utils::timestamp(); + ASSERT(m_ts.has_value()); + ASSERT(m_seq_id.has_value()); + m_per_iter_latency.push_back(curr_ts - *m_ts); + m_per_iter_seq_ids.push_back(*m_seq_id); + return has_data; +} + +} // anonymous namespace + +PerformanceSimulation::PerformanceSimulation(Simulation::Config&& cfg, PerformanceSimulation::Options&& opts) + : Simulation(std::move(cfg)), + m_opts(std::move(opts)), + m_strategy(std::make_shared(m_opts)), + m_comp(ComputationBuilder{m_strategy}.build(m_cfg.graph, m_cfg.params, {true /* add performance meta */})) { +} + +std::shared_ptr PerformanceSimulation::compilePipelined(DummySources&& sources, + cv::GCompileArgs&& compile_args) { + if (m_opts.inference_only) { + // TODO: Extend also for ONNXRT backend + compile_args += cv::compile_args(cv::gapi::wip::ov::benchmark_mode{}); + } + auto compiled = m_comp.compileStreaming(descr_of(sources), std::move(compile_args)); + return std::make_shared(std::move(compiled), std::move(sources), m_comp.getOutMeta().size()); +} + +std::shared_ptr PerformanceSimulation::compileSync(const bool drop_frames) { + auto compile_args = cv::compile_args(getNetworksPackage()); + if (m_opts.inference_only) { + // TODO: Extend also for ONNXRT backend + compile_args += cv::compile_args(cv::gapi::wip::ov::benchmark_mode{}); + } + + const uint32_t max_parallel_branches = m_comp.getMaxParallelBranches(); + if (max_parallel_branches > 1u) { + LOG_INFO() << "Found at most " << max_parallel_branches + << " parallel branches in graph," + " so threaded executor will be used" + << std::endl; + ; + compile_args += cv::compile_args(cv::use_threaded_executor{max_parallel_branches}); + } + + auto sources = createSources(drop_frames); + SyncSimulation::Options options{0u}; + if (m_opts.target_latency.has_value()) { + if (!drop_frames) { + THROW_ERROR("Target latency for the stream is only supported when frames drop is enabled!"); + } + // NB: There is no way to specify more than one source currently so assert if it happened. 
+ ASSERT(sources.size() == 1u); + const double target_latency_in_ms = m_opts.target_latency.value(); + const uint64_t source_latency_in_ms = m_cfg.frames_interval_in_us / 1000u; + if (target_latency_in_ms > source_latency_in_ms) { + THROW_ERROR("Target latency must be less or equal than source latency!"); + } + options.after_iter_delay_in_us = static_cast(source_latency_in_ms - target_latency_in_ms) * 1000u; + } + + auto compiled = m_comp.compile(descr_of(sources), std::move(compile_args)); + return std::make_shared(std::move(compiled), std::move(sources), m_comp.getOutMeta().size(), + options); +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/performance_mode.hpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/performance_mode.hpp new file mode 100644 index 00000000000000..16eff684c4e2de --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/performance_mode.hpp @@ -0,0 +1,41 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "simulation/computation.hpp" +#include "simulation/computation_builder.hpp" +#include "simulation/simulation.hpp" + +struct PerformanceStrategy; +class PerformanceSimulation : public Simulation { +public: + struct Options { + IRandomGenerator::Ptr global_initializer; + ModelsAttrMap initializers_map; + ModelsAttrMap input_data_map; + const bool inference_only; + std::optional target_latency; + }; + explicit PerformanceSimulation(Simulation::Config&& cfg, Options&& opts); + + std::shared_ptr compilePipelined(DummySources&& sources, + cv::GCompileArgs&& compiler_args) override; + std::shared_ptr compileSync(const bool drop_frames) override; + +private: + Options m_opts; + std::shared_ptr m_strategy; + Computation m_comp; +}; + +struct PerformanceStrategy : public IBuildStrategy { + explicit PerformanceStrategy(const PerformanceSimulation::Options& opts); + IBuildStrategy::InferBuildInfo build(const InferDesc& infer) override; + + const PerformanceSimulation::Options& opts; +}; diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/reference_mode.cpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/reference_mode.cpp new file mode 100644 index 00000000000000..6eb55ee11fcc30 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/reference_mode.cpp @@ -0,0 +1,361 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "reference_mode.hpp" + +#include + +#include "simulation/computation_builder.hpp" +#include "simulation/executor.hpp" +#include "simulation/layers_data.hpp" +#include "utils/logger.hpp" +#include "utils/utils.hpp" + +#include // cv::GCompileArgs + +namespace { + +struct InputDataVisitor { + InputDataVisitor(const InferDesc& _infer, const CalcRefSimulation::Options& _opts) + : infer(_infer), opts(_opts), providers(infer.input_layers.size()), metas(infer.input_layers.size()) { + } + + void operator()(std::monostate); + void operator()(const std::string&); + void operator()(const LayerVariantAttr&); + + InferDesc infer; + const CalcRefSimulation::Options& opts; + // NB: Relevant when input reference data already exists and need to + // generate exactly the same amount of output data. + // Note that this value must be the same for all models within stream. 
+ cv::util::optional model_required_iterations; + std::vector providers; + std::vector metas; +}; + +void InputDataVisitor::operator()(std::monostate) { + THROW_ERROR("Reference mode requires output data path to be provided" + " in form of either directory or single file!"); +}; + +void InputDataVisitor::operator()(const LayerVariantAttr&) { + THROW_ERROR("Reference mode requires output data path to be provided" + " in form of either directory or single file!"); +}; + +void InputDataVisitor::operator()(const std::string& path_str) { + // NB: Single path provided - either single file or directory. + const auto input_names = extractLayerNames(infer.input_layers); + const auto& initializers = opts.initializers_map.at(infer.tag); + + std::filesystem::path path{path_str}; + if (std::filesystem::exists(path)) { + // NB: Provided path exists - upload input data from there. + LOG_INFO() << "Input data path: " << path << " for model: " << infer.tag << " exists - data will be uploaded" + << std::endl; + auto layers_data = uploadData(path, infer.tag, infer.input_layers, LayersType::INPUT); + // NB: The Number of iterations for every layer is ALWAYS the same. + model_required_iterations = cv::util::make_optional(layers_data.begin()->second.size()); + providers = createConstantProviders(std::move(layers_data), input_names); + } else { + // NB: Provided path doesn't exist - generate data and dump. + LOG_INFO() << "Input data path: " << path << " for model: " << infer.tag + << " doesn't exist - input data will be generated and dumped" << std::endl; + std::vector dump_path_vec; + if (isDirectory(path)) { + // NB: When the directory is provided, the number of input iterations to be generated aren't + // bounded so the "random" providers will generate input data on every iteration that will + // be dumped on the disk afterwards. + dump_path_vec = createDirectoryLayout(path, input_names); + } else { + // NB: When the single file is provided, the execution must be limited to perform + // only 1 iteration. + model_required_iterations = cv::util::optional(1ul); + if (infer.input_layers.size() > 1) { + THROW_ERROR("Model: " << infer.tag + << " must have exactly one input layer in order to dump input data to file: " + << path); + } + // NB: In case directories in that path don't exist. + std::filesystem::create_directories(path.parent_path()); + dump_path_vec = {path}; + } + auto default_initialzer = + opts.global_initializer ? 
opts.global_initializer : std::make_shared(0.0, 255.0); + auto layer_initializers = unpackWithDefault(initializers, input_names, default_initialzer); + providers = createRandomProviders(infer.input_layers, std::move(layer_initializers)); + for (uint32_t i = 0; i < infer.input_layers.size(); ++i) { + metas[i].set(Dump{dump_path_vec[i]}); + } + } +} + +struct OutputDataVisitor { + OutputDataVisitor(const InferDesc& _infer, const CalcRefSimulation::Options& _opts) + : infer(_infer), opts(_opts), metas(infer.output_layers.size()) { + } + + void operator()(std::monostate); + void operator()(const std::string&); + void operator()(const LayerVariantAttr&); + + InferDesc infer; + const CalcRefSimulation::Options& opts; + std::vector metas; +}; + +void OutputDataVisitor::operator()(std::monostate) { + THROW_ERROR("Reference mode requires output data path to be provided" + " in form of either directory or single file!"); +} + +void OutputDataVisitor::operator()(const LayerVariantAttr&) { + THROW_ERROR("Reference mode requires output data path to be provided" + " in form of either directory or single file!"); +} + +void OutputDataVisitor::operator()(const std::string& path_str) { + std::filesystem::path path{path_str}; + // NB: It doesn't matter if path exist or not - regenerate and dump outputs anyway. + std::vector dump_path_vec; + if (isDirectory(path)) { + dump_path_vec = createDirectoryLayout(path, extractLayerNames(infer.output_layers)); + } else { + if (infer.output_layers.size() > 1) { + THROW_ERROR("Model: " << infer.tag + << " must have exactly one output layer in order to dump output data to file: " + << path); + } + dump_path_vec = {path}; + } + for (uint32_t i = 0; i < infer.output_layers.size(); ++i) { + const auto& layer = infer.output_layers[i]; + metas[i].set(Dump{dump_path_vec[i]}); + } +} + +} // anonymous namespace + +class ReferenceStrategy : public IBuildStrategy { +public: + explicit ReferenceStrategy(const CalcRefSimulation::Options& opts); + + IBuildStrategy::InferBuildInfo build(const InferDesc& infer) override; + + // NB: If specified will force execution to perform exactly require_num_iterations + // regardless what user specified. + // Use case is when N input iterations are provided, + // generate exactly the same amount of output iterations. + // Another use case is when there is only single file provided + // so only one input / output iteration must be generated. + cv::optional required_num_iterations; + const CalcRefSimulation::Options& opts; +}; + +ReferenceStrategy::ReferenceStrategy(const CalcRefSimulation::Options& _opts): opts(_opts) { +} + +IBuildStrategy::InferBuildInfo ReferenceStrategy::build(const InferDesc& infer) { + const auto& input_data = opts.input_data_map.at(infer.tag); + InputDataVisitor in_data_visitor{infer, opts}; + std::visit(in_data_visitor, input_data); + // NB: Check if there is required number iterations for current model + // and fail if it's different comparing to other models in stream. 
+ if (in_data_visitor.model_required_iterations) { + const uint64_t required_iters_value = in_data_visitor.model_required_iterations.value(); + LOG_INFO() << "Model: " << infer.tag << " will perform at most " << required_iters_value << " iteration(s)" + << std::endl; + if (!required_num_iterations) { + required_num_iterations = in_data_visitor.model_required_iterations; + } else { + if (required_iters_value != required_num_iterations.value()) { + THROW_ERROR("All models in stream are required to have the same number of iterations!"); + } + } + } + + const auto& output_data = opts.output_data_map.at(infer.tag); + OutputDataVisitor out_data_visitor{infer, opts}; + std::visit(out_data_visitor, output_data); + + return {std::move(in_data_visitor.providers), std::move(in_data_visitor.metas), std::move(out_data_visitor.metas)}; +} + +static void updateCriterion(ITermCriterion::Ptr* criterion, cv::util::optional required_num_iterations) { + if (required_num_iterations.has_value()) { + if (*criterion) { + // NB: Limit user's termination criterion to perfom at most m_required_num_iterations + *criterion = std::make_shared( + *criterion, std::make_shared(required_num_iterations.value())); + } else { + *criterion = std::make_shared(required_num_iterations.value()); + } + } +} + +static void dumpIterOutput(const cv::Mat& mat, const Dump& dump, const size_t iter) { + auto dump_path = dump.path; + if (isDirectory(dump.path)) { + std::stringstream ss; + ss << "iter_" << iter << ".bin"; + dump_path = dump_path / ss.str(); + } + utils::writeToBinFile(dump_path.string(), mat); +}; + +namespace { + +class SyncSimulation : public SyncCompiled { +public: + SyncSimulation(cv::GCompiled&& compiled, std::vector&& sources, std::vector&& out_meta, + cv::util::optional required_num_iterations); + + Result run(ITermCriterion::Ptr criterion) override; + +private: + bool process(cv::GCompiled& pipeline); + + SyncExecutor m_exec; + std::vector m_sources; + std::vector m_out_meta; + std::vector m_out_mats; + size_t m_iter_idx; + cv::optional m_required_num_iterations; +}; + +class PipelinedSimulation : public PipelinedCompiled { +public: + PipelinedSimulation(cv::GStreamingCompiled&& compiled, std::vector&& sources, + std::vector&& out_meta, cv::util::optional required_num_iterations); + + Result run(ITermCriterion::Ptr criterion) override; + +private: + bool process(cv::GStreamingCompiled& pipeline); + + PipelinedExecutor m_exec; + std::vector m_sources; + std::vector m_out_meta; + std::vector> m_opt_mats; + size_t m_iter_idx; + cv::optional m_required_num_iterations; +}; + +//////////////////////////////// SyncSimulation /////////////////////////////// +SyncSimulation::SyncSimulation(cv::GCompiled&& compiled, std::vector&& sources, + std::vector&& out_meta, cv::util::optional required_num_iterations) + : m_exec(std::move(compiled)), + m_sources(std::move(sources)), + m_out_meta(std::move(out_meta)), + m_out_mats(m_out_meta.size()), + m_iter_idx(0u), + m_required_num_iterations(required_num_iterations) { +} + +Result SyncSimulation::run(ITermCriterion::Ptr criterion) { + for (auto src : m_sources) { + src->reset(); + } + using namespace std::placeholders; + auto cb = std::bind(&SyncSimulation::process, this, _1); + updateCriterion(&criterion, m_required_num_iterations); + m_exec.runLoop(cb, criterion); + std::stringstream ss; + ss << "Reference data has been generated for " << m_iter_idx << " iteration(s)"; + return Success{ss.str()}; +}; + +bool SyncSimulation::process(cv::GCompiled& pipeline) { + auto pipeline_outputs 
= cv::gout(); + // NB: Reference is mandatory there since copying empty + // Mat may lead to weird side effects. + for (auto& out_mat : m_out_mats) { + pipeline_outputs += cv::gout(out_mat); + } + cv::GRunArgs pipeline_inputs; + pipeline_inputs.reserve(m_sources.size()); + for (auto src : m_sources) { + cv::gapi::wip::Data data; + src->pull(data); + pipeline_inputs.push_back(std::move(data)); + } + pipeline(std::move(pipeline_inputs), std::move(pipeline_outputs)); + for (size_t i = 0; i < m_out_mats.size(); ++i) { + if (m_out_meta[i].has()) { + const auto& dump = m_out_meta[i].get(); + dumpIterOutput(m_out_mats[i], dump, m_iter_idx); + } + } + ++m_iter_idx; + return true; +} + +//////////////////////////////// PipelinedSimulation /////////////////////////////// +PipelinedSimulation::PipelinedSimulation(cv::GStreamingCompiled&& compiled, std::vector&& sources, + std::vector&& out_meta, + cv::util::optional required_num_iterations) + : m_exec(std::move(compiled)), + m_sources(std::move(sources)), + m_out_meta(std::move(out_meta)), + m_opt_mats(m_out_meta.size()), + m_iter_idx(0u), + m_required_num_iterations(required_num_iterations) { +} + +Result PipelinedSimulation::run(ITermCriterion::Ptr criterion) { + auto pipeline_inputs = cv::gin(); + for (auto source : m_sources) { + pipeline_inputs += cv::gin(static_cast(source)); + } + using namespace std::placeholders; + auto cb = std::bind(&PipelinedSimulation::process, this, _1); + updateCriterion(&criterion, m_required_num_iterations); + m_exec.runLoop(std::move(pipeline_inputs), cb, criterion); + std::stringstream ss; + ss << "Reference data has been generated for " << m_iter_idx << " iteration(s)"; + return Success{ss.str()}; +}; + +bool PipelinedSimulation::process(cv::GStreamingCompiled& pipeline) { + cv::GOptRunArgsP pipeline_outputs; + for (auto& opt_mat : m_opt_mats) { + pipeline_outputs.emplace_back(cv::gout(opt_mat)[0]); + } + const bool has_data = pipeline.pull(std::move(pipeline_outputs)); + for (size_t i = 0; i < m_out_meta.size(); ++i) { + if (m_out_meta[i].has()) { + const auto& dump = m_out_meta[i].get(); + ASSERT(m_opt_mats[i].has_value()); + dumpIterOutput(m_opt_mats[i].value(), dump, m_iter_idx); + } + } + ++m_iter_idx; + return has_data; +} + +} // anonymous namespace + +CalcRefSimulation::CalcRefSimulation(Simulation::Config&& cfg, CalcRefSimulation::Options&& opts) + : Simulation(std::move(cfg)), + m_opts(std::move(opts)), + m_strategy(std::make_shared(m_opts)), + m_comp(ComputationBuilder{m_strategy}.build(m_cfg.graph, m_cfg.params, {false /* add performance meta */})) { +} + +std::shared_ptr CalcRefSimulation::compilePipelined(DummySources&& sources, + cv::GCompileArgs&& compile_args) { + auto compiled = m_comp.compileStreaming(descr_of(sources), std::move(compile_args)); + auto out_meta = m_comp.getOutMeta(); + return std::make_shared(std::move(compiled), std::move(sources), std::move(out_meta), + m_strategy->required_num_iterations); +} + +std::shared_ptr CalcRefSimulation::compileSync(DummySources&& sources, cv::GCompileArgs&& compile_args) { + auto compiled = m_comp.compile(descr_of(sources), std::move(compile_args)); + auto out_meta = m_comp.getOutMeta(); + return std::make_shared(std::move(compiled), std::move(sources), std::move(out_meta), + m_strategy->required_num_iterations); +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/reference_mode.hpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/reference_mode.hpp new file mode 100644 index 00000000000000..22d2fd92cce2c6 --- /dev/null +++ 
b/src/plugins/intel_npu/tools/protopipe/src/simulation/reference_mode.hpp @@ -0,0 +1,35 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "simulation/computation.hpp" +#include "simulation/simulation.hpp" + +class ReferenceStrategy; +class CalcRefSimulation : public Simulation { +public: + struct Options { + // FIXME: In fact, there should be only input data initializers + // and the path where to dump outputs + IRandomGenerator::Ptr global_initializer; + ModelsAttrMap initializers_map; + ModelsAttrMap input_data_map; + ModelsAttrMap output_data_map; + }; + + explicit CalcRefSimulation(Simulation::Config&& cfg, Options&& opts); + + std::shared_ptr compilePipelined(DummySources&& sources, + cv::GCompileArgs&& compile_args) override; + std::shared_ptr compileSync(DummySources&& sources, cv::GCompileArgs&& compiler_args) override; + +private: + Options m_opts; + std::shared_ptr m_strategy; + Computation m_comp; +}; diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/simulation.cpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/simulation.cpp new file mode 100644 index 00000000000000..52f57c2881a3b6 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/simulation.cpp @@ -0,0 +1,131 @@ +// +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "simulation/simulation.hpp" + +#include "scenario/inference.hpp" +#include "utils/error.hpp" + +#include // onnx::Params +#include // ov::Params + +static cv::gapi::GNetPackage getNetPackage(const std::string& tag, const OpenVINOParams& params) { + using P = cv::gapi::ov::Params; + std::unique_ptr
<P> network; + if (std::holds_alternative(params.path)) { + const auto& model_path = std::get(params.path); + network = std::make_unique<P>
(tag, model_path.model, model_path.bin, params.device); + } else { + GAPI_Assert(std::holds_alternative(params.path)); + const auto& blob_path = std::get(params.path); + network = std::make_unique<P>
(tag, blob_path.blob, params.device); + } + + network->cfgPluginConfig(params.config); + network->cfgNumRequests(params.nireq); + + // NB: Pre/Post processing can be configured only for Model case. + if (std::holds_alternative(params.path)) { + if (std::holds_alternative(params.output_precision)) { + network->cfgOutputTensorPrecision(std::get(params.output_precision)); + } else if (std::holds_alternative>(params.output_precision)) { + network->cfgOutputTensorPrecision(std::get>(params.output_precision)); + } + + if (std::holds_alternative(params.input_layout)) { + network->cfgInputTensorLayout(std::get(params.input_layout)); + } else if (std::holds_alternative>(params.input_layout)) { + network->cfgInputTensorLayout(std::get>(params.input_layout)); + } + + if (std::holds_alternative(params.output_layout)) { + network->cfgOutputTensorLayout(std::get(params.output_layout)); + } else if (std::holds_alternative>(params.output_layout)) { + network->cfgOutputTensorLayout(std::get>(params.output_layout)); + } + + if (std::holds_alternative(params.input_model_layout)) { + network->cfgInputModelLayout(std::get(params.input_model_layout)); + } else if (std::holds_alternative>(params.input_model_layout)) { + network->cfgInputModelLayout(std::get>(params.input_model_layout)); + } + + if (std::holds_alternative(params.output_model_layout)) { + network->cfgOutputModelLayout(std::get(params.output_model_layout)); + } else if (std::holds_alternative>(params.output_model_layout)) { + network->cfgOutputModelLayout(std::get>(params.output_model_layout)); + } + } + return cv::gapi::networks(*network); +} + +static void cfgExecutionProvider(cv::gapi::onnx::Params& network, + const ONNXRTParams::OpenVINO& ovep) { + network.cfgAddExecutionProvider(cv::gapi::onnx::ep::OpenVINO{ovep.params_map}); +} + +static void cfgExecutionProvider(cv::gapi::onnx::Params& network, const ONNXRTParams::EP& ep) { + // NB: Nothing to configure for default MLAS EP + if (std::holds_alternative(ep)) { + return; + } + // TODO: Extend for any other available execution provider + ASSERT(std::holds_alternative(ep)); + cfgExecutionProvider(network, std::get(ep)); +} + +static cv::gapi::GNetPackage getNetPackage(const std::string& tag, const ONNXRTParams& params) { + cv::gapi::onnx::Params network{tag, params.model_path}; + network.cfgSessionOptions(params.session_options); + cfgExecutionProvider(network, params.ep); + return cv::gapi::networks(network); +} + +static cv::gapi::GNetPackage getNetPackage(const std::string& tag, const InferenceParams& params) { + if (std::holds_alternative(params)) { + return getNetPackage(tag, std::get(params)); + } + ASSERT(std::holds_alternative(params)); + return getNetPackage(tag, std::get(params)); +} + +cv::gapi::GNetPackage Simulation::getNetworksPackage() const { + cv::gapi::GNetPackage networks; + for (const auto& [tag, params] : m_cfg.params) { + networks += getNetPackage(tag, params); + } + return networks; +} + +Simulation::Simulation(Config&& cfg): m_cfg(std::move(cfg)){}; + +std::vector Simulation::createSources(const bool drop_frames) { + auto src = std::make_shared(m_cfg.frames_interval_in_us, drop_frames, + m_cfg.disable_high_resolution_timer); + return {src}; +}; + +std::shared_ptr Simulation::compilePipelined(const bool drop_frames) { + if (drop_frames) { + THROW_ERROR("Pipelined simulation doesn't support frames drop!"); + } + // NB: Hardcoded for pipelining mode as the best option + auto compile_args = cv::compile_args(getNetworksPackage()); + compile_args += 
cv::compile_args(cv::gapi::streaming::queue_capacity{1u}); + return compilePipelined(createSources(drop_frames), std::move(compile_args)); +} + +std::shared_ptr Simulation::compileSync(const bool drop_frames) { + auto compile_args = cv::compile_args(getNetworksPackage()); + return compileSync(createSources(drop_frames), std::move(compile_args)); +} + +std::shared_ptr Simulation::compilePipelined(DummySources&&, cv::GCompileArgs&&) { + THROW_ERROR("Not implemented!"); +}; + +std::shared_ptr Simulation::compileSync(DummySources&&, cv::GCompileArgs&&) { + THROW_ERROR("Not implemented!"); +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/simulation.hpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/simulation.hpp new file mode 100644 index 00000000000000..b60eaf6b5a3148 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/simulation.hpp @@ -0,0 +1,57 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "result.hpp" +#include "scenario/criterion.hpp" +#include "scenario/inference.hpp" +#include "scenario/scenario_graph.hpp" +#include "simulation/dummy_source.hpp" + +#include // cv::gapi::GNetPackage + +struct ICompiled { + using Ptr = std::shared_ptr; + virtual Result run(ITermCriterion::Ptr) = 0; +}; + +struct PipelinedCompiled : public ICompiled {}; +struct SyncCompiled : public ICompiled {}; + +using DummySources = std::vector; + +class Simulation { +public: + using Ptr = std::shared_ptr; + + struct Config { + std::string stream_name; + uint64_t frames_interval_in_us; + bool disable_high_resolution_timer; + ScenarioGraph graph; + InferenceParamsMap params; + }; + + explicit Simulation(Config&& cfg); + + virtual std::shared_ptr compilePipelined(const bool drop_frames); + virtual std::shared_ptr compileSync(const bool drop_frames); + + virtual ~Simulation() = default; + +protected: + virtual std::shared_ptr compilePipelined(DummySources&& sources, + cv::GCompileArgs&& compile_args); + virtual std::shared_ptr compileSync(DummySources&& sources, cv::GCompileArgs&& compile_args); + + std::vector createSources(const bool drop_frames); + cv::gapi::GNetPackage getNetworksPackage() const; + +protected: + Config m_cfg; +}; diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/validation_mode.cpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/validation_mode.cpp new file mode 100644 index 00000000000000..c6544522287048 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/validation_mode.cpp @@ -0,0 +1,363 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "simulation/validation_mode.hpp" + +#include "scenario/accuracy_metrics.hpp" +#include "simulation/computation_builder.hpp" +#include "simulation/executor.hpp" +#include "simulation/layers_data.hpp" +#include "simulation/validation_mode.hpp" +#include "utils/logger.hpp" +#include "utils/utils.hpp" + +#include // cv::GCompileArgs + +class LayerValidator { +public: + LayerValidator(const std::string& tag, const std::string& layer_name, IAccuracyMetric::Ptr metric); + Result operator()(const cv::Mat& lhs, const cv::Mat& rhs); + +private: + std::string m_tag; + std::string m_layer_name; + IAccuracyMetric::Ptr m_metric; +}; + +LayerValidator::LayerValidator(const std::string& tag, const std::string& layer_name, IAccuracyMetric::Ptr metric) + : m_tag(tag), m_layer_name(layer_name), m_metric(metric) { +} + +Result 
LayerValidator::operator()(const cv::Mat& lhs, const cv::Mat& rhs) { + auto result = m_metric->compare(lhs, rhs); + if (!result) { + std::stringstream ss; + ss << "Model: " << m_tag << ", Layer: " << m_layer_name << ", Metric: " << m_metric->str() + << ", Reason: " << result.str() << ";"; + return Error{ss.str()}; + } + return Success{"Passed"}; +} + +namespace { + +struct InputDataVisitor { + InputDataVisitor(const InferDesc& _infer, const ValSimulation::Options& _opts) + : infer(_infer), opts(_opts), providers(infer.input_layers.size()), metas(infer.input_layers.size()) { + } + + void operator()(std::monostate); + void operator()(const std::string&); + void operator()(const LayerVariantAttr&); + + InferDesc infer; + const ValSimulation::Options& opts; + std::vector providers; + std::vector metas; +}; + +void InputDataVisitor::operator()(std::monostate) { + THROW_ERROR("Validation mode requires input data path to be provided" + " in form of either directory or single file!"); +}; + +void InputDataVisitor::operator()(const LayerVariantAttr&) { + THROW_ERROR("Validation mode requires input data path to be provided" + " in form of either directory or single file!"); +}; + +void InputDataVisitor::operator()(const std::string& path_str) { + std::filesystem::path path{path_str}; + LOG_INFO() << "Input data path: " << path << " for model: " << infer.tag << " exists - data will be uploaded" + << std::endl; + auto layers_data = uploadData(path, infer.tag, infer.input_layers, LayersType::INPUT); + providers = createConstantProviders(std::move(layers_data), extractLayerNames(infer.input_layers)); +}; + +struct OutputDataVisitor { + OutputDataVisitor(const InferDesc& _infer, const ValSimulation::Options& _opts) + : infer(_infer), opts(_opts), metas(infer.output_layers.size()) { + } + + void operator()(std::monostate); + void operator()(const std::string&); + void operator()(const LayerVariantAttr&); + + InferDesc infer; + const ValSimulation::Options& opts; + std::vector metas; +}; + +void OutputDataVisitor::operator()(std::monostate) { + THROW_ERROR("Validation mode requires output data path to be provided" + " in form of either directory or single file!"); +} + +void OutputDataVisitor::operator()(const LayerVariantAttr&) { + THROW_ERROR("Validation mode requires output data path to be provided" + " in form of either directory or single file!"); +} + +void OutputDataVisitor::operator()(const std::string& path_str) { + auto default_metric = opts.global_metric ? 
opts.global_metric : std::make_shared(0.0); + auto per_layer_metrics = + unpackWithDefault(opts.metrics_map.at(infer.tag), extractLayerNames(infer.output_layers), default_metric); + std::filesystem::path path{path_str}; + LOG_INFO() << "Reference output data path: " << path << " for model: " << infer.tag + << " exists - data will be uploaded" << std::endl; + auto layers_data = uploadData(path, infer.tag, infer.output_layers, LayersType::OUTPUT); + for (uint32_t i = 0; i < infer.output_layers.size(); ++i) { + const auto& layer = infer.output_layers[i]; + LayerValidator validator{infer.tag, layer.name, per_layer_metrics.at(layer.name)}; + metas[i].set(Validate{std::move(validator), layers_data.at(layer.name)}); + } +} + +} // anonymous namespace + +class ValidationStrategy : public IBuildStrategy { +public: + explicit ValidationStrategy(const ValSimulation::Options& _opts): opts(_opts) { + } + + IBuildStrategy::InferBuildInfo build(const InferDesc& infer) override { + const auto& input_data = opts.input_data_map.at(infer.tag); + InputDataVisitor in_data_visitor{infer, opts}; + std::visit(in_data_visitor, input_data); + + const auto& output_data = opts.output_data_map.at(infer.tag); + OutputDataVisitor out_data_visitor{infer, opts}; + std::visit(out_data_visitor, output_data); + + if (opts.per_iter_outputs_path.has_value()) { + auto model_dir = opts.per_iter_outputs_path.value() / infer.tag; + // NB: Remove the data from the previous run if such exist + LOG_INFO() << "Actual output data for model: " << infer.tag + << " will be dumped and replaced at path: " << model_dir << std::endl; + std::filesystem::remove_all(model_dir); + auto dump_path_vec = createDirectoryLayout(model_dir, extractLayerNames(infer.output_layers)); + for (uint32_t i = 0; i < infer.output_layers.size(); ++i) { + out_data_visitor.metas[i].set(Dump{dump_path_vec[i]}); + } + } + + // NB: No special input meta for this mode. + std::vector input_meta(infer.input_layers.size(), Meta{}); + return {std::move(in_data_visitor.providers), std::move(input_meta), std::move(out_data_visitor.metas)}; + } + + const ValSimulation::Options& opts; +}; + +struct FailedIter { + size_t iter_idx; + std::vector reasons; +}; + +static Result reportValidationResult(const std::vector& failed_iters, const size_t total_iters) { + std::stringstream ss; + if (!failed_iters.empty()) { + const auto kItersToShow = 10u; + const auto kLimit = failed_iters.size() < kItersToShow ? 
failed_iters.size() : kItersToShow; + ss << "Accuraccy check failed on " << failed_iters.size() << " iteration(s)" + << " (first " << kLimit << "):"; + ss << "\n"; + for (uint32_t i = 0; i < kLimit; ++i) { + ss << "Iteration " << failed_iters[i].iter_idx << ":\n"; + for (const auto& reason : failed_iters[i].reasons) { + ss << " " << reason << "\n"; + } + } + return Error{ss.str()}; + } + ss << "Validation has passed for " << total_iters << " iteration(s)"; + return Success{ss.str()}; +} + +static std::vector validateOutputs(const std::vector& out_mats, const std::vector& out_meta, + const size_t iter_idx) { + std::vector failed_list; + for (size_t i = 0; i < out_mats.size(); ++i) { + if (out_meta[i].has()) { + const auto& val = out_meta[i].get(); + const auto& refvec = val.reference; + ASSERT(!refvec.empty()); + const auto& refmat = refvec[iter_idx % refvec.size()]; + auto result = val.validator(refmat, out_mats[i]); + if (!result) { + failed_list.push_back(std::move(result.str())); + } + } + } + return failed_list; +} + +static void dumpOutputs(const std::vector& out_mats, const std::vector& out_meta, + const size_t iter_idx) { + for (size_t i = 0; i < out_mats.size(); ++i) { + if (out_meta[i].has()) { + std::stringstream ss; + ss << "iter_" << iter_idx << ".bin"; + auto dump_path = out_meta[i].get().path / ss.str(); + utils::writeToBinFile(dump_path.string(), out_mats[i]); + } + } +} + +namespace { + +class SyncSimulation : public SyncCompiled { +public: + SyncSimulation(cv::GCompiled&& compiled, std::vector&& sources, std::vector&& out_meta); + + Result run(ITermCriterion::Ptr criterion) override; + +private: + bool process(cv::GCompiled& pipeline); + + SyncExecutor m_exec; + std::vector m_sources; + std::vector m_out_meta; + std::vector m_out_mats; + size_t m_iter_idx; + std::vector m_failed_iters; +}; + +class PipelinedSimulation : public PipelinedCompiled { +public: + PipelinedSimulation(cv::GStreamingCompiled&& compiled, std::vector&& sources, + std::vector&& out_meta); + + Result run(ITermCriterion::Ptr criterion) override; + +private: + bool process(cv::GStreamingCompiled& pipeline); + + PipelinedExecutor m_exec; + std::vector m_sources; + std::vector m_out_meta; + std::vector> m_opt_mats; + size_t m_iter_idx; + std::vector m_failed_iters; +}; + +//////////////////////////////// SyncSimulation /////////////////////////////// +SyncSimulation::SyncSimulation(cv::GCompiled&& compiled, std::vector&& sources, + std::vector&& out_meta) + : m_exec(std::move(compiled)), + m_sources(std::move(sources)), + m_out_meta(std::move(out_meta)), + m_out_mats(m_out_meta.size()), + m_iter_idx(0u) { +} + +Result SyncSimulation::run(ITermCriterion::Ptr criterion) { + for (auto src : m_sources) { + src->reset(); + } + using namespace std::placeholders; + auto cb = std::bind(&SyncSimulation::process, this, _1); + m_exec.runLoop(cb, criterion); + return reportValidationResult(m_failed_iters, m_iter_idx); +}; + +bool SyncSimulation::process(cv::GCompiled& pipeline) { + auto pipeline_outputs = cv::gout(); + // NB: Reference is mandatory there since copying empty + // Mat may lead to weird side effects. 
+ for (auto& out_mat : m_out_mats) { + pipeline_outputs += cv::gout(out_mat); + } + cv::GRunArgs pipeline_inputs; + pipeline_inputs.reserve(m_sources.size()); + for (auto src : m_sources) { + cv::gapi::wip::Data data; + src->pull(data); + pipeline_inputs.push_back(std::move(data)); + } + pipeline(std::move(pipeline_inputs), std::move(pipeline_outputs)); + + dumpOutputs(m_out_mats, m_out_meta, m_iter_idx); + auto failed_list = validateOutputs(m_out_mats, m_out_meta, m_iter_idx); + if (!failed_list.empty()) { + m_failed_iters.push_back(FailedIter{m_iter_idx, std::move(failed_list)}); + } + ++m_iter_idx; + return true; +} + +//////////////////////////////// PipelinedSimulation /////////////////////////////// +PipelinedSimulation::PipelinedSimulation(cv::GStreamingCompiled&& compiled, std::vector&& sources, + std::vector&& out_meta) + : m_exec(std::move(compiled)), + m_sources(std::move(sources)), + m_out_meta(std::move(out_meta)), + m_opt_mats(m_out_meta.size()), + m_iter_idx(0u) { +} + +Result PipelinedSimulation::run(ITermCriterion::Ptr criterion) { + auto pipeline_inputs = cv::gin(); + for (auto source : m_sources) { + pipeline_inputs += cv::gin(static_cast(source)); + } + using namespace std::placeholders; + auto cb = std::bind(&PipelinedSimulation::process, this, _1); + m_exec.runLoop(std::move(pipeline_inputs), cb, criterion); + return reportValidationResult(m_failed_iters, m_iter_idx); +}; + +bool PipelinedSimulation::process(cv::GStreamingCompiled& pipeline) { + cv::GOptRunArgsP pipeline_outputs; + for (auto& opt_mat : m_opt_mats) { + pipeline_outputs.emplace_back(cv::gout(opt_mat)[0]); + } + const bool has_data = pipeline.pull(std::move(pipeline_outputs)); + std::vector out_mats; + out_mats.reserve(m_opt_mats.size()); + for (auto opt_mat : m_opt_mats) { + ASSERT(opt_mat.has_value()); + out_mats.push_back(opt_mat.value()); + } + + dumpOutputs(out_mats, m_out_meta, m_iter_idx); + auto failed_list = validateOutputs(out_mats, m_out_meta, m_iter_idx); + if (!failed_list.empty()) { + m_failed_iters.push_back(FailedIter{m_iter_idx, std::move(failed_list)}); + } + ++m_iter_idx; + return has_data; +} + +} // anonymous namespace + +ValSimulation::ValSimulation(Simulation::Config&& cfg, ValSimulation::Options&& opts) + : Simulation(std::move(cfg)), + m_opts(std::move(opts)), + m_strategy(std::make_shared(m_opts)), + m_comp(ComputationBuilder{m_strategy}.build(m_cfg.graph, m_cfg.params, {false /* add performance meta */})) { +} + +std::shared_ptr ValSimulation::compilePipelined(DummySources&& sources, + cv::GCompileArgs&& compile_args) { + auto compiled = m_comp.compileStreaming(descr_of(sources), std::move(compile_args)); + auto out_meta = m_comp.getOutMeta(); + return std::make_shared(std::move(compiled), std::move(sources), std::move(out_meta)); +} + +std::shared_ptr ValSimulation::compileSync(DummySources&& sources, cv::GCompileArgs&& compile_args) { + const uint32_t max_parallel_branches = m_comp.getMaxParallelBranches(); + if (max_parallel_branches > 1u) { + LOG_INFO() << "Found at most " << max_parallel_branches + << " parallel branches in graph," + " so threaded executor will be used" + << std::endl; + ; + compile_args += cv::compile_args(cv::use_threaded_executor{max_parallel_branches}); + } + auto compiled = m_comp.compile(descr_of(sources), std::move(compile_args)); + auto out_meta = m_comp.getOutMeta(); + return std::make_shared(std::move(compiled), std::move(sources), std::move(out_meta)); +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/validation_mode.hpp 
b/src/plugins/intel_npu/tools/protopipe/src/simulation/validation_mode.hpp new file mode 100644 index 00000000000000..180c802803a68c --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/validation_mode.hpp @@ -0,0 +1,34 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "simulation/computation.hpp" +#include "simulation/simulation.hpp" + +class ValidationStrategy; +class ValSimulation : public Simulation { +public: + struct Options { + IAccuracyMetric::Ptr global_metric; + ModelsAttrMap metrics_map; + ModelsAttrMap input_data_map; + ModelsAttrMap output_data_map; + std::optional per_iter_outputs_path; + }; + explicit ValSimulation(Simulation::Config&& cfg, Options&& opts); + + std::shared_ptr compilePipelined(DummySources&& sources, + cv::GCompileArgs&& compile_args) override; + std::shared_ptr compileSync(DummySources&& sources, cv::GCompileArgs&& compiler_args) override; + +private: + Options m_opts; + std::shared_ptr m_strategy; + Computation m_comp; +}; diff --git a/src/plugins/intel_npu/tools/protopipe/src/utils/data_providers.cpp b/src/plugins/intel_npu/tools/protopipe/src/utils/data_providers.cpp new file mode 100644 index 00000000000000..f3eaf7756e1793 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/utils/data_providers.cpp @@ -0,0 +1,64 @@ +// +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + + +#include "data_providers.hpp" + +#include + +#include "utils.hpp" +#include "utils/error.hpp" + +UniformGenerator::UniformGenerator(double low, double high): m_low(low), m_high(high) { + ASSERT(low <= high); +} + +void UniformGenerator::generate(cv::Mat& mat) { + cv::randu(mat, m_low, m_high); +} + +std::string UniformGenerator::str() const { + std::stringstream ss; + ss << "{dist: uniform, range: [" << m_low << ", " << m_high << "]}"; + return ss.str(); +} + +RandomProvider::RandomProvider(IRandomGenerator::Ptr impl, const std::vector& dims, const int depth) + : m_impl(impl), m_dims(dims), m_depth(depth) { +} + +void RandomProvider::pull(cv::Mat& mat) { + utils::createNDMat(mat, m_dims, m_depth); + m_impl->generate(mat); +} + +cv::GMatDesc RandomProvider::desc() { + if (m_dims.size() == 2u) { + return cv::GMatDesc{m_depth, 1, cv::Size(m_dims[1], m_dims[0])}; + } + return cv::GMatDesc{m_depth, m_dims}; +} + +CircleBuffer::CircleBuffer(const std::vector& buffer): m_buffer(buffer), m_pos(0u) { + ASSERT(!m_buffer.empty()); +} + +CircleBuffer::CircleBuffer(std::vector&& buffer): m_buffer(std::move(buffer)), m_pos(0u) { + ASSERT(!m_buffer.empty()); +} + +CircleBuffer::CircleBuffer(cv::Mat mat): CircleBuffer(std::vector{mat}) { +} + +void CircleBuffer::pull(cv::Mat& mat) { + m_buffer[m_pos++].copyTo(mat); + if (m_pos == m_buffer.size()) { + m_pos = 0; + } +} + +cv::GMatDesc CircleBuffer::desc() { + return cv::descr_of(m_buffer[0]); +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/utils/data_providers.hpp b/src/plugins/intel_npu/tools/protopipe/src/utils/data_providers.hpp new file mode 100644 index 00000000000000..2bd45b7f19cc25 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/utils/data_providers.hpp @@ -0,0 +1,70 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include + +struct IDataProvider { + using Ptr = std::shared_ptr; + virtual void pull(cv::Mat& mat) = 0; + virtual cv::GMatDesc desc() = 0; + virtual void reset() 
= 0; + virtual ~IDataProvider() = default; +}; + +class IRandomGenerator { +public: + using Ptr = std::shared_ptr; + virtual void generate(cv::Mat& mat) = 0; + virtual ~IRandomGenerator() = default; + virtual std::string str() const = 0; +}; + +class UniformGenerator : public IRandomGenerator { +public: + using Ptr = std::shared_ptr; + UniformGenerator(double low, double high); + void generate(cv::Mat& mat) override; + virtual std::string str() const override; + +private: + double m_low, m_high; +}; + +class RandomProvider : public IDataProvider { +public: + RandomProvider(IRandomGenerator::Ptr impl, const std::vector& dims, const int depth); + + void pull(cv::Mat& mat) override; + cv::GMatDesc desc() override; + void reset() override { /* do nothing */ + } + +private: + IRandomGenerator::Ptr m_impl; + std::vector m_dims; + int m_depth; +}; + +class CircleBuffer : public IDataProvider { +public: + CircleBuffer(const std::vector& buffer); + CircleBuffer(std::vector&& buffer); + CircleBuffer(cv::Mat mat); + + void pull(cv::Mat& mat) override; + cv::GMatDesc desc() override; + void reset() override { + m_pos = 0; + } + +private: + std::vector m_buffer; + uint64_t m_pos; +}; diff --git a/src/plugins/intel_npu/tools/protopipe/src/utils/error.hpp b/src/plugins/intel_npu/tools/protopipe/src/utils/error.hpp new file mode 100644 index 00000000000000..23cb2a8f46436c --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/utils/error.hpp @@ -0,0 +1,39 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace details { + +[[noreturn]] inline void assert_abort(const char* str, const int line, const char* file, const char* func) { + std::stringstream ss; + ss << file << ":" << line << ": Assertion " << str << " in function " << func << " failed\n"; + std::cerr << ss.str() << std::flush; + abort(); +} + +[[noreturn]] inline void throw_error(const char* str) { + std::stringstream ss; + ss << "An exception thrown! 
" << str << std::flush; + throw std::logic_error(ss.str()); +} + +} // namespace details + +#define ASSERT(expr) \ + { \ + if (!(expr)) \ + ::details::assert_abort(#expr, __LINE__, __FILE__, __func__); \ + } + +#define THROW_ERROR(msg) \ + { \ + std::ostringstream os; \ + os << msg; \ + ::details::throw_error(os.str().c_str()); \ + } diff --git a/src/plugins/intel_npu/tools/protopipe/src/utils/logger.cpp b/src/plugins/intel_npu/tools/protopipe/src/utils/logger.cpp new file mode 100644 index 00000000000000..ccba64e701975c --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/utils/logger.cpp @@ -0,0 +1,32 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "utils/logger.hpp" + +#include + +LogLevel Logger::global_lvl = LogLevel::None; + +Logger::Logger(LogLevel lvl): m_lvl(lvl) { +} + +std::stringstream& Logger::stream() { + return m_ss; +} + +Logger::~Logger() { + if (m_lvl <= Logger::global_lvl) { + switch (m_lvl) { + case LogLevel::Info: + std::cout << "[ INFO ] " << m_ss.str(); + break; + case LogLevel::Debug: + std::cout << "[ DEBUG ] " << m_ss.str(); + break; + default: + /* do nothing */; + } + } +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/utils/logger.hpp b/src/plugins/intel_npu/tools/protopipe/src/utils/logger.hpp new file mode 100644 index 00000000000000..e8b1f5df7f8fa3 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/utils/logger.hpp @@ -0,0 +1,29 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +enum class LogLevel { + None = 0, + Info = 1, + Debug = 2, +}; + +class Logger { +public: + static LogLevel global_lvl; + explicit Logger(LogLevel lvl); + std::stringstream& stream(); + ~Logger(); + +private: + LogLevel m_lvl; + std::stringstream m_ss; +}; + +#define LOG_INFO() Logger{LogLevel::Info}.stream() +#define LOG_DEBUG() Logger{LogLevel::Debug}.stream() diff --git a/src/plugins/intel_npu/tools/protopipe/src/utils/timer.cpp b/src/plugins/intel_npu/tools/protopipe/src/utils/timer.cpp new file mode 100644 index 00000000000000..a1fc0f4c2643c4 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/utils/timer.cpp @@ -0,0 +1,73 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "timer.hpp" +#include "utils.hpp" + +#include + +#if defined(_WIN32) +#include + +class WinTimer : public SleepTimer { +public: + WinTimer(bool disable_high_resolution_timer); + void wait(std::chrono::microseconds time) override; + ~WinTimer(); + +private: + HANDLE m_handle = nullptr; +}; + +WinTimer::WinTimer(bool disable_high_resolution_timer) { + // FIXME: It should be called once. + timeBeginPeriod(1); + m_handle = CreateWaitableTimerEx( + NULL, NULL, disable_high_resolution_timer ? 
0 : CREATE_WAITABLE_TIMER_HIGH_RESOLUTION, TIMER_ALL_ACCESS); +} + +void WinTimer::wait(std::chrono::microseconds time) { + LARGE_INTEGER li; + using ns_t = std::chrono::nanoseconds; + using ns_100_t = std::chrono::duration, ns_t::period>>; + + li.QuadPart = -std::chrono::duration_cast(time).count(); + if (!SetWaitableTimer(m_handle, &li, 0, NULL, NULL, false)) { + CloseHandle(m_handle); + throw std::logic_error("WinTimer failed to setup"); + } + + if (WaitForSingleObject(m_handle, INFINITE) != WAIT_OBJECT_0) { + CloseHandle(m_handle); + throw std::logic_error("WinTimer failed to sleep"); + } +} + +WinTimer::~WinTimer() { + CancelWaitableTimer(m_handle); + CloseHandle(m_handle); +} + +#endif // defined(_WIN32) + +class ChronoTimer : public SleepTimer { + void wait(std::chrono::microseconds time) override; +}; + +void ChronoTimer::wait(std::chrono::microseconds time) { + std::this_thread::sleep_for(time); +} + +SleepTimer::Ptr SleepTimer::create(bool disable_high_resolution_timer) { +#if defined(_WIN32) + return std::make_shared(disable_high_resolution_timer); +#else + return std::make_shared(); +#endif +} + +void BusyTimer::wait(std::chrono::microseconds time) { + utils::busyWait(time); +} diff --git a/src/plugins/intel_npu/tools/protopipe/src/utils/timer.hpp b/src/plugins/intel_npu/tools/protopipe/src/utils/timer.hpp new file mode 100644 index 00000000000000..423966ad2300a9 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/utils/timer.hpp @@ -0,0 +1,25 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +struct IWaitable { + using Ptr = std::shared_ptr; + virtual void wait(std::chrono::microseconds time) = 0; + virtual ~IWaitable() = default; +}; + +struct SleepTimer : public IWaitable { + using Ptr = std::shared_ptr; + static Ptr create(bool disable_high_resolution_timer = false); +}; + +struct BusyTimer : public IWaitable { + void wait(std::chrono::microseconds time) override; +}; diff --git a/src/plugins/intel_npu/tools/protopipe/src/utils/utils.cpp b/src/plugins/intel_npu/tools/protopipe/src/utils/utils.cpp new file mode 100644 index 00000000000000..94081dd295229e --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/utils/utils.cpp @@ -0,0 +1,84 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "utils.hpp" + +#include + +#include + +namespace utils { + +void createNDMat(cv::Mat& mat, const std::vector& dims, int depth) { + GAPI_Assert(!dims.empty()); + mat.create(dims, depth); + if (dims.size() == 1) { + // FIXME: Well-known 1D mat WA + mat.dims = 1; + } +} + +void generateRandom(cv::Mat& out) { + switch (out.depth()) { + case CV_8U: + cv::randu(out, 0, 255); + break; + case CV_32S: + cv::randu(out, 0, 255); + break; + case CV_32F: + cv::randu(out, 0.f, 255.f); + break; + case CV_16F: { + std::vector dims; + for (int i = 0; i < out.size.dims(); ++i) { + dims.push_back(out.size[i]); + } + cv::Mat fp32_mat; + createNDMat(fp32_mat, dims, CV_32F); + cv::randu(fp32_mat, 0.f, 255.f); + fp32_mat.convertTo(out, out.type()); + break; + } + default: + throw std::logic_error("Unsupported preprocessing depth"); + } +} + +cv::Mat createRandom(const std::vector& dims, int depth) { + cv::Mat mat; + createNDMat(mat, dims, depth); + generateRandom(mat); + return mat; +} + +void readFromBinFile(const std::string& filepath, cv::Mat& mat) { + std::ifstream ifs(filepath, std::ios::binary | std::ios::ate); + + if (!ifs.is_open()) { + 
throw std::logic_error("Failed to open: " + filepath); + } + + const auto file_byte_size = ifs.tellg(); + ifs.seekg(0, std::ios::beg); + + const auto mat_byte_size = mat.total() * mat.elemSize(); + if (file_byte_size != mat_byte_size) { + throw std::logic_error("Failed to read cv::Mat from binary file: " + filepath + ". Mat size: " + + std::to_string(mat_byte_size) + ", File size: " + std::to_string(file_byte_size)); + } + + ifs.read(mat.ptr(), mat_byte_size); +} + +void writeToBinFile(const std::string& filepath, const cv::Mat& mat) { + std::ofstream fout(filepath, std::ios::out | std::ios::binary); + if (!fout.is_open()) { + throw std::logic_error("Failed to open/create: " + filepath); + } + fout.write(mat.ptr(), mat.total() * mat.elemSize()); +} + +} // namespace utils diff --git a/src/plugins/intel_npu/tools/protopipe/src/utils/utils.hpp b/src/plugins/intel_npu/tools/protopipe/src/utils/utils.hpp new file mode 100644 index 00000000000000..a2ee4bdcf742d5 --- /dev/null +++ b/src/plugins/intel_npu/tools/protopipe/src/utils/utils.hpp @@ -0,0 +1,65 @@ +// +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace utils { + +void createNDMat(cv::Mat& mat, const std::vector& dims, int depth); +void generateRandom(cv::Mat& out); +cv::Mat createRandom(const std::vector& dims, int depth); + +template +typename duration_t::rep measure(std::function f) { + using namespace std::chrono; + auto start = high_resolution_clock::now(); + f(); + return duration_cast(high_resolution_clock::now() - start).count(); +} + +template +typename duration_t::rep timestamp() { + using namespace std::chrono; + auto now = high_resolution_clock::now(); + return duration_cast(now.time_since_epoch()).count(); +} + +inline void busyWait(std::chrono::microseconds delay) { + auto start_ts = timestamp(); + auto end_ts = start_ts; + auto time_to_wait = delay.count(); + + while (end_ts - start_ts < time_to_wait) { + end_ts = timestamp(); + } +} + +template +double avg(const std::vector& vec) { + return std::accumulate(vec.begin(), vec.end(), 0.0) / vec.size(); +} + +template +T max(const std::vector& vec) { + return *std::max_element(vec.begin(), vec.end()); +} + +template +T min(const std::vector& vec) { + return *std::min_element(vec.begin(), vec.end()); +} + +void readFromBinFile(const std::string& filepath, cv::Mat& mat); +void writeToBinFile(const std::string& filepath, const cv::Mat& mat); + +} // namespace utils diff --git a/src/plugins/intel_npu/tools/single-image-test/CMakeLists.txt b/src/plugins/intel_npu/tools/single-image-test/CMakeLists.txt index 09ed0db315785c..e6c24566777d4b 100644 --- a/src/plugins/intel_npu/tools/single-image-test/CMakeLists.txt +++ b/src/plugins/intel_npu/tools/single-image-test/CMakeLists.txt @@ -26,7 +26,7 @@ foreach(LIB opencv_core opencv_imgproc opencv_imgcodecs) endforeach() if(NOT MISSING_DEPENDENCIES STREQUAL "") - message(WARNING "${TARGET_NAME} tool is disabled due to missing dependencies: ${MISSING_DEPENDENCIES}") + message(STATUS "NPU ${TARGET_NAME} tool is disabled due to missing dependencies: ${MISSING_DEPENDENCIES}") return() endif() From 03c9ae38292a90ecb5cbfe2c8d5472eed0ec1aa9 Mon Sep 17 00:00:00 2001 From: Eddy Kim Date: Fri, 18 Oct 2024 22:35:26 +0900 Subject: [PATCH 22/32] [GPU] Removed redundant part for dump file name (#27123) ### Details: - Fixed dump file name to not have layer ID twice. 
--- src/plugins/intel_gpu/src/graph/debug_helper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/graph/debug_helper.cpp b/src/plugins/intel_gpu/src/graph/debug_helper.cpp index 7f7071e704683e..c2c41fdfab2373 100644 --- a/src/plugins/intel_gpu/src/graph/debug_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/debug_helper.cpp @@ -295,7 +295,7 @@ NodeDebugHelper::NodeDebugHelper(const primitive_inst& inst) debug_config->dump_layers_dst_only == 0 && debug_config->is_layer_for_dumping(layer_name)) { std::string debug_str_for_bin_load = " Command for loading : OV_GPU_LoadDumpRawBinary=\"" + layer_name + ":"; for (size_t i = 0; i < m_inst.dependencies().size(); i++) { - std::string name = get_file_prefix() + layer_name + "_src" + std::to_string(i); + std::string name = get_file_prefix() + "_src" + std::to_string(i); auto input_mem = m_inst.dep_memory_ptr(i); if (input_mem == nullptr) { GPU_DEBUG_COUT << " input_mem_" << i << " is nullptr. Nothing to dump." << std::endl; From 373cf9083fdba52f6bea94a5c89f279c8ce99f35 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Fri, 18 Oct 2024 15:49:37 +0200 Subject: [PATCH 23/32] [GHA] Enable thread sanitizer (#26634) ### Details: - Enable thread sanitizer - Fixed protobuf build ### Tickets: - *ticket-id* --- .github/workflows/linux_sanitizers.yml | 234 +++++++++--------- .../compile_flags/sanitizer.cmake | 10 +- .../frontend/tensorflow/node_context.hpp | 3 +- tests/{ => sanitizers}/asan/ignore.txt | 0 tests/{ => sanitizers}/asan/suppressions.supp | 0 tests/{ => sanitizers}/lsan/suppressions.txt | 0 tests/sanitizers/tsan/suppressions.txt | 15 ++ thirdparty/dependencies.cmake | 9 + 8 files changed, 155 insertions(+), 116 deletions(-) rename tests/{ => sanitizers}/asan/ignore.txt (100%) rename tests/{ => sanitizers}/asan/suppressions.supp (100%) rename tests/{ => sanitizers}/lsan/suppressions.txt (100%) create mode 100644 tests/sanitizers/tsan/suppressions.txt diff --git a/.github/workflows/linux_sanitizers.yml b/.github/workflows/linux_sanitizers.yml index e1a71fe92dc1a3..f13f3765d4f353 100644 --- a/.github/workflows/linux_sanitizers.yml +++ b/.github/workflows/linux_sanitizers.yml @@ -1,10 +1,9 @@ -name: Linux Sanitizers (Ubuntu 20.04, Python 3.11) +name: Linux Sanitizers (Ubuntu 20.04, Python 3.9) on: schedule: # run daily at 00:00 - cron: '0 0 * * *' workflow_dispatch: - # pull_request: concurrency: # github.ref is not unique in post-commit @@ -14,22 +13,69 @@ concurrency: permissions: read-all env: - PIP_CACHE_PATH: /mount/caches/pip/linux - PYTHON_VERSION: '3.11' TARGET_BRANCH: ${{ github.base_ref || github.event.merge_group.base_ref || github.ref }} jobs: + Smart_CI: + runs-on: ubuntu-latest + outputs: + affected_components: "${{ steps.smart_ci.outputs.affected_components }}" + changed_components: "${{ steps.smart_ci.outputs.changed_components }}" + skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" + steps: + - name: checkout action + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + sparse-checkout: .github/actions/smart-ci + + - name: Get affected components + id: smart_ci + uses: ./.github/actions/smart-ci + with: + repository: ${{ github.repository }} + pr: ${{ github.event.number }} + commit_sha: ${{ github.sha }} + ref_name: ${{ github.ref_name }} + component_pattern: "category: (.*)" + repo_token: ${{ secrets.GITHUB_TOKEN }} + skip_when_only_listed_labels_set: 'docs' + skip_when_only_listed_files_changed: 
'*.md,*.rst,*.png,*.jpg,*.svg,*/layer_tests_summary/*,*/conformance/*' + + Docker: + needs: Smart_CI + runs-on: aks-linux-4-cores-16gb-docker-build + container: + image: openvinogithubactions.azurecr.io/docker_build:0.2 + volumes: + - /mount:/mount + outputs: + images: "${{ steps.handle_docker.outputs.images }}" + steps: + - name: Checkout + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + + - uses: ./.github/actions/handle_docker + id: handle_docker + with: + images: | + ov_build/ubuntu_22_04_x64 + registry: 'openvinogithubactions.azurecr.io' + dockerfiles_root_dir: '.github/dockerfiles' + changed_components: ${{ needs.smart_ci.outputs.changed_components }} + Build: + needs: [Smart_CI, Docker] timeout-minutes: 500 defaults: run: shell: bash - runs-on: aks-linux-16-cores-32gb + runs-on: aks-linux-16-cores-64gb if: ${{ github.repository_owner == 'openvinotoolkit' }} container: - image: openvinogithubactions.azurecr.io/dockerhub/ubuntu:20.04 + image: ${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_22_04_x64 }} volumes: - /mount:/mount + options: -e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING strategy: max-parallel: 3 fail-fast: false @@ -39,10 +85,9 @@ jobs: SANITIZER_CMAKE_OPTION: '-DENABLE_SANITIZER=ON' - SANITIZER: 'UndefinedBehavior' SANITIZER_CMAKE_OPTION: '-DENABLE_UB_SANITIZER=ON' -# - SANITIZER: 'Thread' # Problems with protobuf -# SANITIZER_CMAKE_OPTION: '-DENABLE_THREAD_SANITIZER=ON' + - SANITIZER: 'Thread' + SANITIZER_CMAKE_OPTION: '-DENABLE_THREAD_SANITIZER=ON' env: - DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input CMAKE_BUILD_TYPE: 'RelWithDebInfo' CMAKE_GENERATOR: 'Ninja' GITHUB_WORKSPACE: '/__w/openvino/openvino' @@ -51,20 +96,16 @@ jobs: INSTALL_DIR: /__w/openvino/openvino/openvino_install INSTALL_TEST_DIR: /__w/openvino/openvino/tests_install BUILD_DIR: /__w/openvino/openvino/openvino_build - LSAN_IGNORE: /__w/openvino/openvino/openvino/tests/lsan/suppressions.txt - ASAN_IGNORE: /__w/openvino/openvino/openvino/tests/asan/suppressions.supp - CXX: clang++ - CC: clang + CMAKE_CXX_COMPILER_LAUNCHER: sccache + CMAKE_C_COMPILER_LAUNCHER: sccache + SCCACHE_IGNORE_SERVER_IO_ERROR: 1 + SCCACHE_SERVER_PORT: 35555 + SCCACHE_ERROR_LOG: /__w/openvino/sccache_log.txt + SCCACHE_LOG: warn + SCCACHE_AZURE_KEY_PREFIX: sanitizers_lin_${{ matrix.SANITIZER }}_master + SCCACHE_CACHE_SIZE: 50G steps: - - name: Set apt retries - run: echo 'Acquire::Retries "10";' > /etc/apt/apt.conf.d/80-retries - - - name: Install git - run: | - apt-get update - apt-get install --assume-yes --no-install-recommends git ca-certificates - - name: Clone OpenVINO uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 with: @@ -79,31 +120,9 @@ jobs: submodules: 'true' ref: ${{ env.TARGET_BRANCH }} - # - # Print system info - # - - name: System info uses: ./openvino/.github/actions/system_info - - # - # Dependencies - # - - - name: Install build dependencies - run: | - bash ${OPENVINO_REPO}/install_build_dependencies.sh - apt --assume-yes install clang lld - - - name: Setup Python ${{ env.PYTHON_VERSION }} - uses: ./openvino/.github/actions/setup_python - with: - version: ${{ env.PYTHON_VERSION }} - pip-cache-path: ${{ env.PIP_CACHE_PATH }} - should-setup-pip-paths: 'true' - self-hosted-runner: 'true' - show-cache-info: 'true' - + - name: Install python dependencies run: | # For Python API: build and wheel packaging @@ -120,17 +139,15 @@ jobs: # For running Paddle frontend unit tests python3 -m pip install -r 
${OPENVINO_REPO}/src/frontends/paddle/tests/requirements.txt - + # # Build # - + - name: Clean sccache stats + run: ${SCCACHE_PATH} --zero-stats + - name: CMake configure - OpenVINO run: | - export ASAN_OPTIONS=halt_on_error=0:suppressions=${ASAN_IGNORE} - export LSAN_OPTIONS=suppressions=${LSAN_IGNORE}:NEOReadDebugKeys=1:DisableDeepBind=1 - export CC=clang - export CXX=clang++ cmake \ -G "${{ env.CMAKE_GENERATOR }}" \ -DENABLE_CPPLINT=OFF \ @@ -147,24 +164,26 @@ jobs: -DENABLE_OV_PYTORCH_FRONTEND=ON \ -DENABLE_OV_JAX_FRONTEND=ON \ -DENABLE_OV_ONNX_FRONTEND=ON \ + -DENABLE_INTEL_NPU=OFF \ -DENABLE_ONEDNN_FOR_GPU=OFF \ -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ + -DENABLE_WHEEL=OFF \ ${{ matrix.SANITIZER_CMAKE_OPTION }} \ -S ${OPENVINO_REPO} \ -B ${BUILD_DIR} - name: Cmake build - OpenVINO run: | - export ASAN_OPTIONS=halt_on_error=0:suppressions=${ASAN_IGNORE} - export LSAN_OPTIONS=suppressions=${LSAN_IGNORE}:NEOReadDebugKeys=1:DisableDeepBind=1 - cmake --build ${BUILD_DIR} --parallel --config ${{ env.CMAKE_BUILD_TYPE }} - + cmake --build ${BUILD_DIR} --parallel $(nproc) --config ${{ env.CMAKE_BUILD_TYPE }} + + - name: Show sccache stats + run: ${SCCACHE_PATH} --show-stats + - name: Cmake install - OpenVINO run: | cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR} -P ${BUILD_DIR}/cmake_install.cmake --config ${{ env.CMAKE_BUILD_TYPE }} cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_TEST_DIR} -DCOMPONENT=tests -P ${BUILD_DIR}/cmake_install.cmake --config ${{ env.CMAKE_BUILD_TYPE }} - cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR} -DCOMPONENT=python_wheels -P ${BUILD_DIR}/cmake_install.cmake --config ${{ env.CMAKE_BUILD_TYPE }} - + - name: Remove unused files to free space run: rm -rf ${BUILD_DIR}/* @@ -185,6 +204,13 @@ jobs: # # Upload build artifacts # + - name: Upload sccache log + if: ${{ always() }} + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + with: + name: sccache_log_${{ matrix.SANITIZER }} + path: ${{ env.SCCACHE_ERROR_LOG }} + if-no-files-found: 'error' - name: Upload openvino package if: ${{ always() }} @@ -205,11 +231,11 @@ jobs: CXX_Unit_Tests: name: C++ unit tests if: ${{ github.repository_owner == 'openvinotoolkit' }} - needs: Build - timeout-minutes: 100 - runs-on: 'aks-linux-16-cores-32gb' + needs: [Docker, Build] + timeout-minutes: 120 + runs-on: aks-linux-16-cores-32gb container: - image: 'openvinogithubactions.azurecr.io/dockerhub/ubuntu:20.04' + image: ${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_22_04_x64 }} defaults: run: shell: bash @@ -220,18 +246,18 @@ jobs: include: - SANITIZER: 'AddressAndLeak' - SANITIZER: 'UndefinedBehavior' -# - SANITIZER: 'Thread' # Problems with protobuf at the Build stage + - SANITIZER: 'Thread' env: - DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input - INSTALL_DIR: ${{ github.workspace }}/install - INSTALL_TEST_DIR: ${{ github.workspace }}/install/tests + OPENVINO_REPO: /__w/openvino/openvino/openvino + INSTALL_DIR: /__w/openvino/openvino/install + INSTALL_TEST_DIR: /__w/openvino/openvino/install/tests + BUILD_DIR: /__w/openvino/openvino/openvino_build TBB_ENABLE_SANITIZERS: 1 - CC: clang - CXX: clang++ + ASAN_OPTIONS: halt_on_error=0:suppressions=/__w/openvino/openvino/openvino/tests/sanitizers/asan/suppressions.supp + LSAN_OPTIONS: suppressions=/__w/openvino/openvino/openvino/tests/sanitizers/lsan/suppressions.txt:NEOReadDebugKeys=1:DisableDeepBind=1 + TSAN_OPTIONS: suppressions=/__w/openvino/openvino/openvino/tests/sanitizers/tsan/suppressions.txt + steps: - - name: Set apt retries 
- run: echo 'Acquire::Retries "10";' > /etc/apt/apt.conf.d/80-retries - - name: Download OpenVINO package uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 with: @@ -244,16 +270,6 @@ jobs: name: ${{ format('openvino_tests_{0}', matrix.SANITIZER) }} path: ${{ env.INSTALL_TEST_DIR }} - # Needed as ${{ github.workspace }} is not working correctly when using Docker - - name: Setup Variables - continue-on-error: true - run: | - echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" - echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" - - echo "ASAN_OPTIONS=halt_on_error=0:suppressions=$GITHUB_WORKSPACE/openvino/tests/asan/suppressions.supp" >> "$GITHUB_ENV" - echo "LSAN_OPTIONS=suppressions=$GITHUB_WORKSPACE/openvino/tests/lsan/suppressions.txt:NEOReadDebugKeys=1:DisableDeepBind=1" >> "$GITHUB_ENV" - - name: Extract OpenVINO packages run: | pushd $INSTALL_DIR @@ -263,77 +279,71 @@ jobs: pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - - name: Install dependencies (Linux) - run: | - $INSTALL_DIR/install_dependencies/install_openvino_dependencies.sh -c=core -c=dev -c=gpu -y - apt update && apt --assume-yes install clang lld - - name: Fetch Sanitizer Suppression Lists uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 with: sparse-checkout: | - tests/lsan/suppressions.txt - tests/asan/suppressions.supp + tests/sanitizers/lsan/suppressions.txt + tests/sanitizers/asan/suppressions.supp + tests/sanitizers/tsan/suppressions.txt sparse-checkout-cone-mode: false - path: 'openvino' + path: ${{ env.OPENVINO_REPO }} # # Tests # - name: OpenVINO Core Unit Tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh ${INSTALL_TEST_DIR}/ov_core_unit_tests --gtest_print_time=1 --gtest_filter=-*IE_GPU* \ --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-OVCoreUT.xml - name: OpenVINO Inference Functional Tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh - ${INSTALL_TEST_DIR}/ov_inference_functional_tests --gtest_print_time=1 \ --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-InferenceFunc.xml - name: OpenVINO Inference Unit Tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh ${INSTALL_TEST_DIR}/ov_inference_unit_tests --gtest_print_time=1 \ --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-InferenceUnit.xml - name: Low Precision Transformations Tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh - ${INSTALL_TEST_DIR}/ov_lp_transformations_tests --gtest_print_time=1 \ --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-LpTransformations.xml - name: OpenVINO Conditional compilation tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh ${INSTALL_TEST_DIR}/ov_conditional_compilation_tests --gtest_print_time=1 \ --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-ConditionalCompilation.xml - name: IR frontend tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh ${INSTALL_TEST_DIR}/ov_ir_frontend_tests --gtest_print_time=1 \ --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-IRFrontend.xml - name: PaddlePaddle frontend tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh ${INSTALL_TEST_DIR}/paddle_tests --gtest_print_time=1 \ --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-PaddleTests.xml - name: ONNX frontend tests - if: always() + if: ${{ !cancelled() && matrix.SANITIZER != 'Thread' }} # Ticket: 155291 run: | source 
${INSTALL_DIR}/setupvars.sh ${INSTALL_TEST_DIR}/ov_onnx_frontend_tests --gtest_print_time=1 \ @@ -341,14 +351,14 @@ jobs: --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-ONNXFrontend.xml - name: TensorFlow Common frontend tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh ${INSTALL_TEST_DIR}/ov_tensorflow_common_tests --gtest_print_time=1 \ --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-TensorFlowCommonFrontend.xml - name: TensorFlow frontend tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh @@ -371,56 +381,56 @@ jobs: --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-Transformations.xml - name: Common test utils tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh ${INSTALL_TEST_DIR}/ov_util_tests --gtest_print_time=1 \ --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-CommonUtilTests.xml - name: Snippets func tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh ${INSTALL_TEST_DIR}/ov_snippets_func_tests --gtest_print_time=1 \ --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-SnippetsFuncTests.xml - name: CPU plugin unit tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh ${INSTALL_TEST_DIR}/ov_cpu_unit_tests --gtest_print_time=1 \ --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-CPUUnitTests.xml - name: ov_subgraphs_dumper_tests tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh ${INSTALL_TEST_DIR}/ov_subgraphs_dumper_tests --gtest_print_time=1 \ --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-ov_subgraphs_dumper_tests.xml - name: Template OpImpl tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh ${INSTALL_TEST_DIR}/ov_op_conformance_tests --gtest_print_time=1 --device=TEMPLATE --gtest_filter=*OpImpl*\ --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-OpImplTests.xml - name: AUTO unit tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh ${INSTALL_TEST_DIR}/ov_auto_unit_tests --gtest_print_time=1 \ --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-ov_auto_unit_tests.xml - name: AUTO func Tests - if: always() + if: ${{ 'false' }} # Issue 155210 run: | source ${{ env.INSTALL_DIR }}/setupvars.sh ${{ env.INSTALL_TEST_DIR }}/ov_auto_func_tests --gtest_print_time=1 \ --gtest_output=xml:${{ env.INSTALL_TEST_DIR }}/TEST-ov_auto_func_tests.xml - name: Template plugin func tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh ${INSTALL_TEST_DIR}/ov_template_func_tests --gtest_print_time=1 \ @@ -428,32 +438,32 @@ jobs: --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-TemplateFuncTests.xml - name: OpenVINO C API tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh ${INSTALL_TEST_DIR}/ov_capi_test --gtest_print_time=1 \ --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-OpenVINOCAPITests.xml - name: AutoBatch unit tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh ${INSTALL_TEST_DIR}/ov_auto_batch_unit_tests --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-ov_auto_batch_unit_tests.xml - name: AutoBatch func tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh ${INSTALL_TEST_DIR}/ov_auto_batch_func_tests --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-ov_auto_batch_func_tests.xml --gtest_filter="*smoke*" - name: Proxy Plugin func tests - if: always() + if: ${{ !cancelled() }} run: | source ${INSTALL_DIR}/setupvars.sh 
${INSTALL_TEST_DIR}/ov_proxy_plugin_tests --gtest_print_time=1 --gtest_output=xml:${INSTALL_TEST_DIR}/TEST-OVProxyTests.xml - name: Hetero unit tests - if: always() + if: ${{ !cancelled() }} run: | source ${{ env.INSTALL_DIR }}/setupvars.sh ${{ env.INSTALL_TEST_DIR }}/ov_hetero_unit_tests --gtest_print_time=1 --gtest_output=xml:${{ env.INSTALL_TEST_DIR }}/TEST-OVHeteroUnitTests.xml diff --git a/cmake/developer_package/compile_flags/sanitizer.cmake b/cmake/developer_package/compile_flags/sanitizer.cmake index 73f109d726c88b..5fc24c4f862239 100644 --- a/cmake/developer_package/compile_flags/sanitizer.cmake +++ b/cmake/developer_package/compile_flags/sanitizer.cmake @@ -17,7 +17,7 @@ if (ENABLE_SANITIZER) "https://github.com/openvinotoolkit/openvino/wiki/AddressSanitizer-and-LeakSanitizer") endif() elseif(OV_COMPILER_IS_CLANG) - set(SANITIZER_COMPILER_FLAGS "${SANITIZER_COMPILER_FLAGS} -fsanitize=address -fsanitize-blacklist=${OpenVINO_SOURCE_DIR}/tests/asan/ignore.txt") + set(SANITIZER_COMPILER_FLAGS "${SANITIZER_COMPILER_FLAGS} -fsanitize=address -fsanitize-blacklist=${OpenVINO_SOURCE_DIR}/tests/sanitizers/asan/ignore.txt") if(BUILD_SHARED_LIBS) set(SANITIZER_COMPILER_FLAGS "${SANITIZER_COMPILER_FLAGS} -shared-libasan") endif() @@ -27,7 +27,7 @@ if (ENABLE_SANITIZER) set(SANITIZER_COMPILER_FLAGS "${SANITIZER_COMPILER_FLAGS} -fsanitize-recover=address") endif() - set(SANITIZER_LINKER_FLAGS "${SANITIZER_LINKER_FLAGS} -fsanitize=address -fsanitize-blacklist=${OpenVINO_SOURCE_DIR}/tests/asan/ignore.txt") + set(SANITIZER_LINKER_FLAGS "${SANITIZER_LINKER_FLAGS} -fsanitize=address -fsanitize-blacklist=${OpenVINO_SOURCE_DIR}/tests/sanitizers/asan/ignore.txt") if(BUILD_SHARED_LIBS) set(SANITIZER_LINKER_FLAGS "${SANITIZER_LINKER_FLAGS} -shared-libasan") endif() @@ -89,7 +89,11 @@ if(ENABLE_THREAD_SANITIZER) message(FATAL_ERROR "Thread sanitizer is not supported in Windows with MSVC compiler. Please, use clang-cl or mingw") elseif(CMAKE_COMPILER_IS_GNUCXX OR OV_COMPILER_IS_CLANG) set(SANITIZER_COMPILER_FLAGS "${SANITIZER_COMPILER_FLAGS} -fsanitize=thread") - set(SANITIZER_LINKER_FLAGS "${SANITIZER_LINKER_FLAGS} -fsanitize=thread") + if(OV_COMPILER_IS_CLANG) + set(SANITIZER_LINKER_FLAGS "${SANITIZER_LINKER_FLAGS} -ltsan") + else() + set(SANITIZER_LINKER_FLAGS "${SANITIZER_LINKER_FLAGS} -fsanitize=thread") + endif() else() message(WARNING "Unsupported CXX compiler ${CMAKE_CXX_COMPILER_ID}") endif() diff --git a/src/frontends/tensorflow/include/openvino/frontend/tensorflow/node_context.hpp b/src/frontends/tensorflow/include/openvino/frontend/tensorflow/node_context.hpp index f2dba04b49dca7..c23890b90dcab4 100644 --- a/src/frontends/tensorflow/include/openvino/frontend/tensorflow/node_context.hpp +++ b/src/frontends/tensorflow/include/openvino/frontend/tensorflow/node_context.hpp @@ -8,6 +8,7 @@ #include "exception.hpp" #include "openvino/core/any.hpp" #include "openvino/frontend/node_context.hpp" +#include "openvino/frontend/tensorflow/visibility.hpp" #include "variable.hpp" #include "variables_map.hpp" @@ -18,7 +19,7 @@ class TranslateSession; /// Keep necessary data for a single node in the original FW graph to facilitate /// conversion process in the rules code. 
-class NodeContext : public ov::frontend::NodeContext { +class TENSORFLOW_API NodeContext : public ov::frontend::NodeContext { public: using Ptr = std::shared_ptr; NodeContext(const std::shared_ptr& decoder, diff --git a/tests/asan/ignore.txt b/tests/sanitizers/asan/ignore.txt similarity index 100% rename from tests/asan/ignore.txt rename to tests/sanitizers/asan/ignore.txt diff --git a/tests/asan/suppressions.supp b/tests/sanitizers/asan/suppressions.supp similarity index 100% rename from tests/asan/suppressions.supp rename to tests/sanitizers/asan/suppressions.supp diff --git a/tests/lsan/suppressions.txt b/tests/sanitizers/lsan/suppressions.txt similarity index 100% rename from tests/lsan/suppressions.txt rename to tests/sanitizers/lsan/suppressions.txt diff --git a/tests/sanitizers/tsan/suppressions.txt b/tests/sanitizers/tsan/suppressions.txt new file mode 100644 index 00000000000000..0814ce119a0d1d --- /dev/null +++ b/tests/sanitizers/tsan/suppressions.txt @@ -0,0 +1,15 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# This is a ThreadSanitizer (TSan) suppression file which provides a default +# configuration for the builds with -DENABLE_SANITIZER=ON. +# More information about LSan suppressions on +# https://github.com/google/sanitizers/wiki/threadsanitizersuppressions + +# race conditions from thirdparty libs +race:libtbb +mutex:libtbb +race:libhwloc +# race conditions from std libs +race:libstdc++ +race:libc \ No newline at end of file diff --git a/thirdparty/dependencies.cmake b/thirdparty/dependencies.cmake index 8313ca73178283..0e8536a1714a35 100644 --- a/thirdparty/dependencies.cmake +++ b/thirdparty/dependencies.cmake @@ -357,6 +357,15 @@ if(ENABLE_OV_PADDLE_FRONTEND OR ENABLE_OV_ONNX_FRONTEND OR ENABLE_OV_TF_FRONTEND endif() else() add_subdirectory(thirdparty/protobuf EXCLUDE_FROM_ALL) + # protobuf fails to build with -fsanitize=thread by clang + if(ENABLE_THREAD_SANITIZER AND OV_COMPILER_IS_CLANG) + foreach(proto_target protoc libprotobuf libprotobuf-lite) + if(TARGET ${proto_target}) + target_compile_options(${proto_target} PUBLIC -fno-sanitize=thread) + target_link_options(${proto_target} PUBLIC -fno-sanitize=thread) + endif() + endforeach() + endif() endif() # forward additional variables used in the other places From 0648cd0c5f5ad3dd91560a731b365acfdfb3c676 Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Fri, 18 Oct 2024 18:57:26 +0200 Subject: [PATCH 24/32] [PT FE] Fix sym GPTQ pattern to have consistent graph (#27037) ### Details: - *Fix sym GPTQ pattern to have consistent graph* ### Tickets: - *ticket-id* --- .../src/openvino/frontend/pytorch/gptq.py | 3 ++- .../src/transforms/u4_block_repack.cpp | 23 +++++++++++++++++-- tests/model_hub_tests/pytorch/test_llm.py | 2 +- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/bindings/python/src/openvino/frontend/pytorch/gptq.py b/src/bindings/python/src/openvino/frontend/pytorch/gptq.py index 3fe1ba465dfd1f..a1c6aecc45d421 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/gptq.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/gptq.py @@ -77,7 +77,8 @@ def patched_forward_sym(self, *args, **kwargs): unpacked_weights, 1, 2).contiguous().view(-1, self.group_size, self.width) # all zp is 8 for symmetrical, will repack to i4 in pt fe transformation - unpacked_weights = unpacked_weights.to(dtype) * self.scales + unpacked_weights = (unpacked_weights.to(torch.int8) - torch.tensor(8, dtype=torch.int8)) + unpacked_weights = unpacked_weights.to(dtype) * 
self.scales unpacked_weights = unpacked_weights.view(-1, self.width) out = x @ unpacked_weights diff --git a/src/frontends/pytorch/src/transforms/u4_block_repack.cpp b/src/frontends/pytorch/src/transforms/u4_block_repack.cpp index 675a293269002b..5130424d0c60ed 100644 --- a/src/frontends/pytorch/src/transforms/u4_block_repack.cpp +++ b/src/frontends/pytorch/src/transforms/u4_block_repack.cpp @@ -7,6 +7,7 @@ #include "openvino/core/rt_info.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/reshape.hpp" +#include "openvino/op/subtract.hpp" #include "openvino/op/transpose.hpp" #include "openvino/pass/pattern/matcher.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" @@ -53,6 +54,7 @@ U4BlockRepack::U4BlockRepack(bool is_symmetrical) { auto reshape1 = pattern_to_output[m_reshape1].get_node_shared_ptr(); auto transpose = pattern_to_output[m_transpose].get_node_shared_ptr(); auto reshape2 = pattern_to_output[m_reshape2].get_node_shared_ptr(); + auto pattern_root = reshape2; if (constant->get_element_type() != element::u4) return false; @@ -76,9 +78,26 @@ U4BlockRepack::U4BlockRepack(bool is_symmetrical) { auto get_number = get_u4; auto constant_dtype = element::u4; + NodeVector copy_from{std::move(constant), std::move(reshape1), std::move(transpose), reshape2}; if (is_symmetrical) { get_number = get_i4; constant_dtype = element::i4; + // find pattern Convert(W, i8) -> Subtract(8) + auto reshape_targets = reshape2->output(0).get_target_inputs(); + if (reshape_targets.size() != 1) + return false; + auto convert = reshape_targets.begin()->get_node()->shared_from_this(); + if (!std::dynamic_pointer_cast(convert)) + return false; + auto convert_targets = convert->output(0).get_target_inputs(); + if (convert_targets.size() != 1) + return false; + auto subtract = convert_targets.begin()->get_node()->shared_from_this(); + if (!std::dynamic_pointer_cast(subtract)) + return false; + pattern_root = subtract; + copy_from.push_back(std::move(convert)); + copy_from.push_back(subtract); } auto new_const = std::make_shared(constant_dtype, destination_shape); auto dst = const_cast( // const_cast? @@ -96,8 +115,8 @@ U4BlockRepack::U4BlockRepack(bool is_symmetrical) { } } - copy_runtime_info({std::move(constant), std::move(reshape1), std::move(transpose), reshape2}, new_const); - replace_node(reshape2, new_const); + copy_runtime_info(copy_from, new_const); + replace_node(pattern_root, new_const); return true; }); diff --git a/tests/model_hub_tests/pytorch/test_llm.py b/tests/model_hub_tests/pytorch/test_llm.py index 9acf8e2100c520..e444f93db9d7ec 100644 --- a/tests/model_hub_tests/pytorch/test_llm.py +++ b/tests/model_hub_tests/pytorch/test_llm.py @@ -128,7 +128,7 @@ def load_model(self, name, type): example["past_key_values"] = pkv example["attention_mask"] = torch.cat( [example["attention_mask"], am], -1) - if atype not in ["opt", "falcon", "mbart_gptq", "mpt"]: + if atype not in ["opt", "falcon", "mbart", "mpt"]: ids = torch.cumsum(example["attention_mask"] != 0, dim=1) - 1 example["position_ids"] = ids[:, - example["input_ids"].shape[1]:] From 8c36c0047377303b0406a74c45195cf29a460b2f Mon Sep 17 00:00:00 2001 From: Ivan Tikhonov Date: Fri, 18 Oct 2024 21:57:50 +0400 Subject: [PATCH 25/32] [CORE] Skip unnecessary convert_to_supported_precision if ConstantFolding is omitted (#26756) Details: It's a modification of https://github.com/openvinotoolkit/openvino/pull/22674 f16 LLM (llama was tested) compilation time on ARM is unreasonable huge. 
Perf report shows that every ConstantFolding transformation takes several seconds even if the graph is not modified. The root cause is util::convert_to_supported_precision call even if constant folding is skipped. The suggested fix is to skip util::convert_to_supported_precision call if folding is not applied. Tickets: CVS-152428 --------- Co-authored-by: Aleksandr Voron Co-authored-by: Andrii Staikov --- src/core/include/openvino/core/node.hpp | 1 + src/core/include/openvino/op/assign.hpp | 2 +- src/core/include/openvino/op/constant.hpp | 2 +- src/core/include/openvino/op/convert_like.hpp | 1 + .../include/openvino/op/fake_quantize.hpp | 3 ++- .../include/openvino/op/random_uniform.hpp | 2 +- src/core/include/openvino/op/read_value.hpp | 2 +- src/core/include/openvino/op/reshape.hpp | 1 + src/core/include/openvino/op/result.hpp | 2 +- src/core/include/openvino/op/shape_of.hpp | 2 ++ src/core/include/openvino/op/squeeze.hpp | 1 + .../include/openvino/op/strided_slice.hpp | 1 + src/core/include/openvino/op/unsqueeze.hpp | 1 + .../include/openvino/op/util/gather_base.hpp | 1 + src/core/src/node.cpp | 14 ++++++++--- src/core/src/op/assign.cpp | 2 +- src/core/src/op/constant.cpp | 2 +- src/core/src/op/convert_like.cpp | 6 ++++- src/core/src/op/random_uniform.cpp | 2 +- src/core/src/op/read_value.cpp | 2 +- src/core/src/op/reshape.cpp | 6 ++++- src/core/src/op/result.cpp | 2 +- src/core/src/op/shape_of.cpp | 12 ++++++++-- src/core/src/op/squeeze.cpp | 6 ++++- src/core/src/op/strided_slice.cpp | 6 ++++- src/core/src/op/unsqueeze.cpp | 6 ++++- src/core/src/op/util/gather_base.cpp | 17 +++++++------ src/core/src/pass/constant_folding.cpp | 24 ++++++++++++------- 28 files changed, 90 insertions(+), 39 deletions(-) diff --git a/src/core/include/openvino/core/node.hpp b/src/core/include/openvino/core/node.hpp index f5a63911abc502..59a4ab29253ded 100644 --- a/src/core/include/openvino/core/node.hpp +++ b/src/core/include/openvino/core/node.hpp @@ -207,6 +207,7 @@ class OPENVINO_API Node : public std::enable_shared_from_this { virtual bool evaluate_upper(ov::TensorVector& output_values) const; virtual bool evaluate_symbol(TensorSymbolVector& output_symbols) const; + virtual bool can_constant_fold(const OutputVector& inputs_values) const; virtual bool constant_fold(OutputVector& output_values, const OutputVector& inputs_values); /// \brief Decomposes the FusedOp into a sub-graph consisting of core openvino ops /// diff --git a/src/core/include/openvino/op/assign.hpp b/src/core/include/openvino/op/assign.hpp index c3f8492e54b4f8..895f6619778951 100644 --- a/src/core/include/openvino/op/assign.hpp +++ b/src/core/include/openvino/op/assign.hpp @@ -67,7 +67,7 @@ class OPENVINO_API Assign : public util::AssignBase { const TensorVector& inputs, const EvaluationContext& evaluation_context) const override; bool has_evaluate() const override; - bool constant_fold(OutputVector& output_values, const OutputVector& inputs_values) override; + bool can_constant_fold(const OutputVector& inputs_values) const override; }; } // namespace v6 } // namespace op diff --git a/src/core/include/openvino/op/constant.hpp b/src/core/include/openvino/op/constant.hpp index 62b70a883fc1a5..ccaae01586d612 100644 --- a/src/core/include/openvino/op/constant.hpp +++ b/src/core/include/openvino/op/constant.hpp @@ -215,7 +215,7 @@ class OPENVINO_API Constant : public Op { bool evaluate_upper(TensorVector& outputs) const override; // Don't constant fold a constant; it would make a copy - bool constant_fold(OutputVector& outputs, const 
OutputVector& inputs) override; + bool can_constant_fold(const OutputVector& inputs_values) const override; /// \brief Returns the value of the constant node as a Shape object /// Can only be used on element::i64 nodes and interprets diff --git a/src/core/include/openvino/op/convert_like.hpp b/src/core/include/openvino/op/convert_like.hpp index 244d0f4c7d70b4..0d7f73075e21b9 100644 --- a/src/core/include/openvino/op/convert_like.hpp +++ b/src/core/include/openvino/op/convert_like.hpp @@ -27,6 +27,7 @@ class OPENVINO_API ConvertLike : public Op { std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; bool constant_fold(OutputVector& output_values, const OutputVector& input_values) override; + bool can_constant_fold(const OutputVector& inputs_values) const override; }; } // namespace v1 } // namespace op diff --git a/src/core/include/openvino/op/fake_quantize.hpp b/src/core/include/openvino/op/fake_quantize.hpp index b47c7016c8709e..52caca885a02cc 100644 --- a/src/core/include/openvino/op/fake_quantize.hpp +++ b/src/core/include/openvino/op/fake_quantize.hpp @@ -69,7 +69,8 @@ class OPENVINO_API FakeQuantize : public Op { bool evaluate(TensorVector& outputs, const TensorVector& inputs) const override; bool has_evaluate() const override; - bool constant_fold(OutputVector& output_values, const OutputVector& inputs_values) override { + + bool can_constant_fold(const OutputVector& inputs_values) const override { return false; } diff --git a/src/core/include/openvino/op/random_uniform.hpp b/src/core/include/openvino/op/random_uniform.hpp index 6a4de83715e30a..22f06f79402135 100644 --- a/src/core/include/openvino/op/random_uniform.hpp +++ b/src/core/include/openvino/op/random_uniform.hpp @@ -42,7 +42,7 @@ class OPENVINO_API RandomUniform : public Op { std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; /// \return Turns off constant folding for RandomUniform operation. - bool constant_fold(OutputVector& output_values, const OutputVector& inputs_values) override; + bool can_constant_fold(const OutputVector& inputs_values) const override; /// \return The output tensor type. 
const ov::element::Type& get_out_type() const; diff --git a/src/core/include/openvino/op/read_value.hpp b/src/core/include/openvino/op/read_value.hpp index 27447644037211..e37d6baa11c01c 100644 --- a/src/core/include/openvino/op/read_value.hpp +++ b/src/core/include/openvino/op/read_value.hpp @@ -80,7 +80,7 @@ class OPENVINO_API ReadValue : public util::ReadValueBase { const EvaluationContext& evaluation_context) const override; bool has_evaluate() const override; - bool constant_fold(OutputVector& output_values, const OutputVector& inputs_values) override; + bool can_constant_fold(const OutputVector& inputs_values) const override; }; } // namespace v6 } // namespace op diff --git a/src/core/include/openvino/op/reshape.hpp b/src/core/include/openvino/op/reshape.hpp index f3a9e7aa8e59c1..48bc08f8c3d947 100644 --- a/src/core/include/openvino/op/reshape.hpp +++ b/src/core/include/openvino/op/reshape.hpp @@ -52,6 +52,7 @@ class OPENVINO_API Reshape : public Op { bool evaluate_lower(TensorVector& outputs) const override; bool evaluate_symbol(TensorSymbolVector& output_symbols) const override; bool constant_fold(OutputVector& output_values, const OutputVector& inputs_values) override; + bool can_constant_fold(const OutputVector& inputs_values) const override; protected: bool m_special_zero; diff --git a/src/core/include/openvino/op/result.hpp b/src/core/include/openvino/op/result.hpp index dc8162a10b6627..00e805d1f2aeb5 100644 --- a/src/core/include/openvino/op/result.hpp +++ b/src/core/include/openvino/op/result.hpp @@ -30,7 +30,7 @@ class OPENVINO_API Result : public Op { bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; bool has_evaluate() const override; - bool constant_fold(OutputVector& output_values, const OutputVector& inputs_values) override; + bool can_constant_fold(const OutputVector& inputs_values) const override; /// \brief Returns current layout, or empty Layout if it is not set Layout get_layout() const; diff --git a/src/core/include/openvino/op/shape_of.hpp b/src/core/include/openvino/op/shape_of.hpp index c8245d91069ed0..375d087f7e6cf8 100644 --- a/src/core/include/openvino/op/shape_of.hpp +++ b/src/core/include/openvino/op/shape_of.hpp @@ -38,6 +38,7 @@ class OPENVINO_API ShapeOf : public util::ShapeOfBase { bool evaluate_upper(TensorVector& output_values) const override; bool evaluate_symbol(TensorSymbolVector& output_symbols) const override; bool constant_fold(OutputVector& output_values, const OutputVector& input_values) override; + bool can_constant_fold(const OutputVector& inputs_values) const override; private: element::Type m_output_type; @@ -64,6 +65,7 @@ class OPENVINO_API ShapeOf : public util::ShapeOfBase { bool evaluate_upper(TensorVector& output_values) const override; bool evaluate_symbol(TensorSymbolVector& output_symbols) const override; bool constant_fold(OutputVector& output_values, const OutputVector& input_values) override; + bool can_constant_fold(const OutputVector& inputs_values) const override; }; } // namespace v0 } // namespace op diff --git a/src/core/include/openvino/op/squeeze.hpp b/src/core/include/openvino/op/squeeze.hpp index f7cb41f974db2f..8c27f29d66df66 100644 --- a/src/core/include/openvino/op/squeeze.hpp +++ b/src/core/include/openvino/op/squeeze.hpp @@ -27,6 +27,7 @@ class OPENVINO_API Squeeze : public Op { bool evaluate_upper(TensorVector& outputs) const override; bool evaluate_symbol(TensorSymbolVector& output_symbols) const override; bool constant_fold(OutputVector& output_values, const 
OutputVector& inputs_values) override; + bool can_constant_fold(const OutputVector& inputs_values) const override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; diff --git a/src/core/include/openvino/op/strided_slice.hpp b/src/core/include/openvino/op/strided_slice.hpp index 2ba4f84c0936bf..aa080bc6563b90 100644 --- a/src/core/include/openvino/op/strided_slice.hpp +++ b/src/core/include/openvino/op/strided_slice.hpp @@ -114,6 +114,7 @@ class OPENVINO_API StridedSlice : public Op { bool evaluate_upper(TensorVector& outputs) const override; bool evaluate_symbol(TensorSymbolVector& output_symbols) const override; bool constant_fold(OutputVector& output_values, const OutputVector& inputs_values) override; + bool can_constant_fold(const OutputVector& inputs_values) const override; private: AxisSet convert_mask_to_axis_set(const std::vector& mask) const; diff --git a/src/core/include/openvino/op/unsqueeze.hpp b/src/core/include/openvino/op/unsqueeze.hpp index d9839c7d68d719..4701df2dd4d4ec 100644 --- a/src/core/include/openvino/op/unsqueeze.hpp +++ b/src/core/include/openvino/op/unsqueeze.hpp @@ -30,6 +30,7 @@ class OPENVINO_API Unsqueeze : public Op { bool evaluate_symbol(TensorSymbolVector& output_symbols) const override; bool constant_fold(OutputVector& output_values, const OutputVector& inputs_values) override; + bool can_constant_fold(const OutputVector& inputs_values) const override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; }; diff --git a/src/core/include/openvino/op/util/gather_base.hpp b/src/core/include/openvino/op/util/gather_base.hpp index f7846b83cfe465..9fa8387aee6b3a 100644 --- a/src/core/include/openvino/op/util/gather_base.hpp +++ b/src/core/include/openvino/op/util/gather_base.hpp @@ -34,6 +34,7 @@ class OPENVINO_API GatherBase : public Op { bool evaluate_symbol(TensorSymbolVector& output_symbols) const override; bool constant_fold(OutputVector& output_values, const OutputVector& inputs_values) override; + bool can_constant_fold(const OutputVector& inputs_values) const override; const int64_t& get_batch_dims() const; void set_batch_dims(int64_t batch_dims); diff --git a/src/core/src/node.cpp b/src/core/src/node.cpp index 0341e4477f4cfb..8b9936b5496e7c 100644 --- a/src/core/src/node.cpp +++ b/src/core/src/node.cpp @@ -696,8 +696,8 @@ bool ov::Node::evaluate_symbol(TensorSymbolVector& output_symbols) const { return false; } -bool ov::Node::constant_fold(OutputVector& output_values, const OutputVector& input_values) { - OV_ITT_SCOPED_TASK(ov::itt::domains::core, "Node::constant_fold"); +bool ov::Node::can_constant_fold(const OutputVector& input_values) const { + OV_ITT_SCOPED_TASK(ov::itt::domains::core, "Node::can_constant_fold"); if (is_const_fold_disabled()) { return false; @@ -707,8 +707,16 @@ bool ov::Node::constant_fold(OutputVector& output_values, const OutputVector& in bool all_constants = std::all_of(input_values.begin(), input_values.end(), [](const Output& input) { return ov::as_type_ptr(input.get_node_shared_ptr()); }); - if (!all_constants) + + return all_constants; +} + +bool ov::Node::constant_fold(OutputVector& output_values, const OutputVector& input_values) { + OV_ITT_SCOPED_TASK(ov::itt::domains::core, "Node::constant_fold"); + + if (!Node::can_constant_fold(input_values)) { return false; + } NodeVector nodes; TensorVector input_tensors; diff --git a/src/core/src/op/assign.cpp b/src/core/src/op/assign.cpp index bf6e55c11b1d39..7798d4328049af 100644 --- a/src/core/src/op/assign.cpp 
+++ b/src/core/src/op/assign.cpp @@ -134,7 +134,7 @@ bool Assign::has_evaluate() const { return true; } -bool Assign::constant_fold(OutputVector& output_values, const OutputVector& inputs_values) { +bool Assign::can_constant_fold(const OutputVector& input_values) const { return false; } } // namespace v6 diff --git a/src/core/src/op/constant.cpp b/src/core/src/op/constant.cpp index 95df6379ba284e..e06718ef4e1fd5 100644 --- a/src/core/src/op/constant.cpp +++ b/src/core/src/op/constant.cpp @@ -663,7 +663,7 @@ bool Constant::evaluate_upper(TensorVector& outputs) const { return evaluate(outputs, {}); } -bool Constant::constant_fold(OutputVector&, const OutputVector&) { +bool Constant::can_constant_fold(const OutputVector& input_values) const { return false; } diff --git a/src/core/src/op/convert_like.cpp b/src/core/src/op/convert_like.cpp index 3dc0159bb556be..4ae4ea982f8cd9 100644 --- a/src/core/src/op/convert_like.cpp +++ b/src/core/src/op/convert_like.cpp @@ -29,9 +29,13 @@ std::shared_ptr ConvertLike::clone_with_new_inputs(const OutputVector& new return std::make_shared(new_args.at(0), new_args.at(1)); } +bool ConvertLike::can_constant_fold(const OutputVector& input_values) const { + return !is_const_fold_disabled(); +} + bool ConvertLike::constant_fold(OutputVector& output_values, const OutputVector& input_values) { OV_OP_SCOPE(v1_ConvertLike_constant_fold); - if (is_const_fold_disabled()) { + if (!can_constant_fold(input_values)) { return false; } diff --git a/src/core/src/op/random_uniform.cpp b/src/core/src/op/random_uniform.cpp index e62be4d26afc58..9aafed881086b6 100644 --- a/src/core/src/op/random_uniform.cpp +++ b/src/core/src/op/random_uniform.cpp @@ -88,7 +88,7 @@ std::shared_ptr RandomUniform::clone_with_new_inputs(const OutputVector& n } /// \return Turns off constant folding for RandomUniform operation. 
-bool RandomUniform::constant_fold(OutputVector& output_values, const OutputVector& inputs_values) { +bool RandomUniform::can_constant_fold(const OutputVector& input_values) const { return false; } diff --git a/src/core/src/op/read_value.cpp b/src/core/src/op/read_value.cpp index 162cb5067bc00a..0d63456a3b8348 100644 --- a/src/core/src/op/read_value.cpp +++ b/src/core/src/op/read_value.cpp @@ -176,7 +176,7 @@ bool ReadValue::has_evaluate() const { return true; } -bool ReadValue::constant_fold(OutputVector& output_values, const OutputVector& inputs_values) { +bool ReadValue::can_constant_fold(const OutputVector& input_values) const { return false; } } // namespace v6 diff --git a/src/core/src/op/reshape.cpp b/src/core/src/op/reshape.cpp index ab0e0a0c17cbde..477e210f574269 100644 --- a/src/core/src/op/reshape.cpp +++ b/src/core/src/op/reshape.cpp @@ -97,7 +97,7 @@ bool Reshape::evaluate_symbol(TensorSymbolVector& output_symbols) const { } bool Reshape::constant_fold(OutputVector& output_values, const OutputVector& inputs_values) { - if (get_output_partial_shape(0).is_dynamic() || is_const_fold_disabled()) { + if (!can_constant_fold(inputs_values)) { return false; } @@ -108,6 +108,10 @@ bool Reshape::constant_fold(OutputVector& output_values, const OutputVector& inp return false; } } + +bool Reshape::can_constant_fold(const OutputVector& input_values) const { + return get_output_partial_shape(0).is_static() && !is_const_fold_disabled(); +} } // namespace v1 } // namespace op } // namespace ov diff --git a/src/core/src/op/result.cpp b/src/core/src/op/result.cpp index 3667e5ff22b422..237d6bd7a2084a 100644 --- a/src/core/src/op/result.cpp +++ b/src/core/src/op/result.cpp @@ -67,7 +67,7 @@ bool Result::has_evaluate() const { return true; } -bool Result::constant_fold(OutputVector& output_values, const OutputVector& inputs_values) { +bool Result::can_constant_fold(const OutputVector& input_values) const { return false; } diff --git a/src/core/src/op/shape_of.cpp b/src/core/src/op/shape_of.cpp index 293c1b5fc5a59c..9676a5704ec99c 100644 --- a/src/core/src/op/shape_of.cpp +++ b/src/core/src/op/shape_of.cpp @@ -168,9 +168,13 @@ bool ShapeOf::evaluate_symbol(TensorSymbolVector& output_symbols) const { return shape_of::evaluate_symbol(this, output_symbols); } +bool ShapeOf::can_constant_fold(const OutputVector& input_values) const { + return !is_const_fold_disabled() && input_values[0].get_partial_shape().is_static(); +} + bool ShapeOf::constant_fold(OutputVector& output_values, const OutputVector& input_values) { OV_OP_SCOPE(v3_ShapeOf_constant_fold); - if (is_const_fold_disabled()) { + if (!can_constant_fold(input_values)) { return false; } return shape_of::constant_fold_shape_of(this, output_values[0], input_values[0]); @@ -222,9 +226,13 @@ bool ShapeOf::has_evaluate() const { } } +bool ShapeOf::can_constant_fold(const OutputVector& input_values) const { + return !is_const_fold_disabled() && input_values[0].get_partial_shape().is_static(); +} + bool ShapeOf::constant_fold(OutputVector& output_values, const OutputVector& input_values) { OV_OP_SCOPE(v0_ShapeOf_constant_fold); - if (is_const_fold_disabled()) { + if (!can_constant_fold(input_values)) { return false; } return shape_of::constant_fold_shape_of(this, output_values[0], input_values[0]); diff --git a/src/core/src/op/squeeze.cpp b/src/core/src/op/squeeze.cpp index 3abc0a773192d2..1b34a4e48a4faf 100644 --- a/src/core/src/op/squeeze.cpp +++ b/src/core/src/op/squeeze.cpp @@ -104,9 +104,13 @@ bool Squeeze::evaluate_symbol(TensorSymbolVector& 
output_symbols) const { return validate::axes_has_and_set_bound(*this) && ov::util::default_symbol_evaluator(this, output_symbols); } +bool Squeeze::can_constant_fold(const OutputVector& inputs_values) const { + return get_output_partial_shape(0).is_static() && !is_const_fold_disabled(); +} + bool Squeeze::constant_fold(OutputVector& output_values, const OutputVector& inputs_values) { OV_OP_SCOPE(v0_Squeeze_constant_fold); - if (get_output_partial_shape(0).is_dynamic() || is_const_fold_disabled()) { + if (!can_constant_fold(inputs_values)) { return false; } diff --git a/src/core/src/op/strided_slice.cpp b/src/core/src/op/strided_slice.cpp index deb89fa9a531d4..83ac3dec7a5f4f 100644 --- a/src/core/src/op/strided_slice.cpp +++ b/src/core/src/op/strided_slice.cpp @@ -283,9 +283,13 @@ bool StridedSlice::evaluate_symbol(TensorSymbolVector& output_symbols) const { default_symbol_evaluator(this, {0}, output_symbols); } +bool StridedSlice::can_constant_fold(const OutputVector& input_values) const { + return !is_const_fold_disabled(); +} + bool StridedSlice::constant_fold(OutputVector& output_values, const OutputVector& inputs_values) { auto is_folded = Node::constant_fold(output_values, inputs_values); - if (!is_const_fold_disabled() && !is_folded) { + if (can_constant_fold(inputs_values) && !is_folded) { // If all ignored mask are set for all begin or end then replace this input by dummy constant // to avoid return false from `could_propagate` during bound evaluation (value of const will be ignored). auto get_indices_input = [&inputs_values](size_t port, const std::vector& mask) -> Output { diff --git a/src/core/src/op/unsqueeze.cpp b/src/core/src/op/unsqueeze.cpp index d199c43a2479b5..f8c14a08f70d30 100644 --- a/src/core/src/op/unsqueeze.cpp +++ b/src/core/src/op/unsqueeze.cpp @@ -77,8 +77,12 @@ bool ov::op::v0::Unsqueeze::evaluate_symbol(TensorSymbolVector& output_symbols) return ov::util::default_symbol_evaluator(this, output_symbols); } +bool ov::op::v0::Unsqueeze::can_constant_fold(const OutputVector& input_values) const { + return get_output_partial_shape(0).is_static() && !is_const_fold_disabled(); +} + bool ov::op::v0::Unsqueeze::constant_fold(OutputVector& output_values, const OutputVector& inputs_values) { - if (get_output_partial_shape(0).is_dynamic() || is_const_fold_disabled()) { + if (!can_constant_fold(inputs_values)) { return false; } diff --git a/src/core/src/op/util/gather_base.cpp b/src/core/src/op/util/gather_base.cpp index 92e41781b1de55..dd35edf695ec16 100644 --- a/src/core/src/op/util/gather_base.cpp +++ b/src/core/src/op/util/gather_base.cpp @@ -32,10 +32,6 @@ Shape out_shape_infer(const Shape& data_shape, const Shape& indices_shape, int64 bool cf_gather_with_subgraph(OutputVector& output_values, const OutputVector& input_values, const PartialShape& gather_ps) { - if (gather_ps.is_dynamic() || input_values.size() != 3) { - return false; - } - const auto concat = std::dynamic_pointer_cast(input_values[0].get_node_shared_ptr()); const auto indices = std::dynamic_pointer_cast(input_values[1].get_node_shared_ptr()); const auto axis = std::dynamic_pointer_cast(input_values[2].get_node_shared_ptr()); @@ -67,7 +63,6 @@ bool cf_gather_with_subgraph(OutputVector& output_values, const auto raw_index = indices->cast_vector()[0]; const auto positive_index = ov::util::normalize(raw_index, rank); OPENVINO_ASSERT(positive_index >= 0 && positive_index < rank); - // gather takes exactly one element out of the Concat output const auto gathered_concat_input = 
concat_inputs[positive_index].get_source_output().get_node_shared_ptr(); // Concat inputs are 1D, resulting tensor shape depends on Gather indices @@ -77,9 +72,7 @@ bool cf_gather_with_subgraph(OutputVector& output_values, const auto axis_const = v0::Constant::create(element::i64, Shape{1}, {0}); gathered = std::make_shared(gathered_concat_input, axis_const); } - output_values[0] = gathered; - return true; } @@ -262,13 +255,19 @@ bool GatherBase::evaluate_symbol(TensorSymbolVector& output_symbols) const { return gather::have_indices_and_axis_bound_set(this) && ov::util::default_symbol_evaluator(this, output_symbols); } +bool GatherBase::can_constant_fold(const OutputVector& input_values) const { + return get_output_partial_shape(0).is_static() && input_values.size() == 3; +} + bool GatherBase::constant_fold(OutputVector& output_values, const OutputVector& input_values) { // try the regular constant folding just for the Gather node if (Node::constant_fold(output_values, input_values)) { return true; - } else { - return gather::cf_gather_with_subgraph(output_values, input_values, get_output_partial_shape(0)); } + if (!can_constant_fold(input_values)) { + return false; + } + return gather::cf_gather_with_subgraph(output_values, input_values, get_output_partial_shape(0)); } } // namespace util } // namespace op diff --git a/src/core/src/pass/constant_folding.cpp b/src/core/src/pass/constant_folding.cpp index 3de91829f91b0c..cc1a7cea5b5add 100644 --- a/src/core/src/pass/constant_folding.cpp +++ b/src/core/src/pass/constant_folding.cpp @@ -105,6 +105,21 @@ bool ov::pass::ConstantFolding::run_on_model(const std::shared_ptr& m for (const auto& original_node : model->get_ordered_ops()) { auto node = original_node; + if (!original_node->can_constant_fold(original_node->input_values())) { + if (auto sub_graph_node = std::dynamic_pointer_cast(node)) { + // recursively constant fold operators containing subgraphs (ie: TensorIterator, Loop) + size_t sub_graphs_num = sub_graph_node->get_internal_subgraphs_size(); + for (size_t sub_graph_ind = 0; sub_graph_ind < sub_graphs_num; ++sub_graph_ind) { + rewritten = + run_on_model(sub_graph_node->get_function(static_cast(sub_graph_ind))) || rewritten; + } + } + rewritten = restore_original_input_precision(original_node) || rewritten; + if (rewritten) { + original_node->validate_and_infer_types(); + } + continue; + } if (node_has_requires_precision_conversion_attribute(node)) { remove_requires_precision_conversion_attribute(node); node = util::convert_to_supported_precision(node.get()); @@ -143,15 +158,6 @@ bool ov::pass::ConstantFolding::run_on_model(const std::shared_ptr& m } } } else { - if (auto sub_graph_node = std::dynamic_pointer_cast(node)) { - // recursively constant fold operators containing subgraphs (ie: TensorIterator, Loop) - size_t sub_graphs_num = sub_graph_node->get_internal_subgraphs_size(); - for (size_t sub_graph_ind = 0; sub_graph_ind < sub_graphs_num; ++sub_graph_ind) { - rewritten = - run_on_model(sub_graph_node->get_function(static_cast(sub_graph_ind))) || rewritten; - } - } - // if CF was unsuccessful remove original precision attribute from inputs bool restored = restore_original_input_precision(original_node); if (restored) { From 13becaa48b24e12964e849d0f2af71d2f5b854ab Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Fri, 18 Oct 2024 20:04:12 +0200 Subject: [PATCH 26/32] [TESTS] Print test names in log (#27121) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- .github/workflows/job_pytorch_layer_tests.yml | 6 +++--- 1 
file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/job_pytorch_layer_tests.yml b/.github/workflows/job_pytorch_layer_tests.yml index abf614c70cff4e..c6cd97422f2b95 100644 --- a/.github/workflows/job_pytorch_layer_tests.yml +++ b/.github/workflows/job_pytorch_layer_tests.yml @@ -121,7 +121,7 @@ jobs: - name: PyTorch Layer Tests if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' }} # Ticket: 126287, 142196 # due to CVS-152795, parallel run is not possible on Windows - run: python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests ${PARALLEL} -m precommit --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml + run: python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests ${PARALLEL} -m precommit -v --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml env: TEST_DEVICE: CPU TEST_PRECISION: FP32 @@ -130,7 +130,7 @@ jobs: - name: PyTorch torch.export Layer Tests if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' && runner.os != 'Windows' }} # Ticket: 126287 run: | - python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests ${PARALLEL} -m precommit_torch_export --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml + python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests ${PARALLEL} -m precommit_torch_export -v --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch_export.xml env: TEST_DEVICE: CPU TEST_PRECISION: FP32 @@ -140,7 +140,7 @@ jobs: - name: PyTorch torch.compile TORCHFX Layer Tests if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.os != 'macOS' && runner.arch != 'ARM64' && runner.os != 'Windows' }} # Ticket: 126287 run: | - python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -m precommit_fx_backend --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml + python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -m precommit_fx_backend -v --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch_compile.xml env: TEST_DEVICE: CPU TEST_PRECISION: FP32 From f33e25565bbf99208630d959c80921437da36536 Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Fri, 18 Oct 2024 23:54:26 +0400 Subject: [PATCH 27/32] [TFL FE] Export public API symbols for TFLite Delegate (#27140) **Details:** We need to properly export public API in tensorflow_lite_frontend shared library so that TFLite delagate can import them. All abstract classes that TFLite Delegate implements on its own should be exported. `QuantizationInfo` also should be exported to avoid duplications/re-definitions in binaries. 
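For illustration, such export macros typically expand to compiler-specific visibility attributes. The sketch below is a generic example only — the macro body and the `BUILD_TENSORFLOW_LITE_FE` define are placeholders, not the actual contents of the frontend's visibility header:

```cpp
// Hedged sketch of a public-API export macro such as TENSORFLOW_LITE_API.
// BUILD_TENSORFLOW_LITE_FE stands in for whatever define is set while
// compiling the frontend library itself (an assumption for this example).
#if defined(_WIN32)
#    if defined(BUILD_TENSORFLOW_LITE_FE)
#        define TENSORFLOW_LITE_API __declspec(dllexport)
#    else
#        define TENSORFLOW_LITE_API __declspec(dllimport)
#    endif
#else
#    define TENSORFLOW_LITE_API __attribute__((visibility("default")))
#endif

// A class implemented by the TFLite Delegate outside the library carries the
// macro so its vtable, typeinfo and methods stay visible across the DLL/.so boundary.
class TENSORFLOW_LITE_API ExampleDecoder {
public:
    virtual ~ExampleDecoder() = default;
    virtual int input_size() const = 0;  // hypothetical interface method
};
```

Without such annotations the delegate would have to re-define these types locally, which is the duplication issue mentioned above for `QuantizationInfo`.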
**Ticket:** TBD Signed-off-by: Kazantsev, Roman --- .../include/openvino/frontend/tensorflow_lite/decoder.hpp | 8 ++++---- .../openvino/frontend/tensorflow_lite/graph_iterator.hpp | 2 +- .../frontend/tensorflow_lite/quantization_info.hpp | 2 +- .../openvino/frontend/tensorflow_lite/sparsity_info.hpp | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/frontends/tensorflow_lite/include/openvino/frontend/tensorflow_lite/decoder.hpp b/src/frontends/tensorflow_lite/include/openvino/frontend/tensorflow_lite/decoder.hpp index b3415cf288c4be..a2cafe16e075fb 100644 --- a/src/frontends/tensorflow_lite/include/openvino/frontend/tensorflow_lite/decoder.hpp +++ b/src/frontends/tensorflow_lite/include/openvino/frontend/tensorflow_lite/decoder.hpp @@ -14,7 +14,7 @@ namespace ov { namespace frontend { namespace tensorflow_lite { -struct TensorMetaInfo { +struct TENSORFLOW_LITE_API TensorMetaInfo { std::shared_ptr m_quantization_info; std::shared_ptr m_sparsity_info; ov::PartialShape m_partial_shape; @@ -23,11 +23,11 @@ struct TensorMetaInfo { std::string m_tensor_name; }; -class DecoderBase : public ov::frontend::DecoderBase {}; +class TENSORFLOW_LITE_API DecoderBase : public ov::frontend::DecoderBase {}; // DecoderBaseOperation corresponds to operation node to retrieve its attributes and information about input and output // tensors -class DecoderBaseOperation : public ov::frontend::tensorflow_lite::DecoderBase { +class TENSORFLOW_LITE_API DecoderBaseOperation : public ov::frontend::tensorflow_lite::DecoderBase { public: /// \brief Get input tensor name by index /// Operation nodes are connected between each other by tensors. @@ -71,7 +71,7 @@ class DecoderBaseOperation : public ov::frontend::tensorflow_lite::DecoderBase { // DecoderBaseTensor corresponds to tensor node to retrieve information about type, shapem quantization and sparsity // information -class DecoderBaseTensor : public ov::frontend::tensorflow_lite::DecoderBase { +class TENSORFLOW_LITE_API DecoderBaseTensor : public ov::frontend::tensorflow_lite::DecoderBase { public: /// \brief Get tensor info virtual TensorMetaInfo get_tensor_info() const = 0; diff --git a/src/frontends/tensorflow_lite/include/openvino/frontend/tensorflow_lite/graph_iterator.hpp b/src/frontends/tensorflow_lite/include/openvino/frontend/tensorflow_lite/graph_iterator.hpp index 8ec2bc3f05c358..2084147c9ab284 100644 --- a/src/frontends/tensorflow_lite/include/openvino/frontend/tensorflow_lite/graph_iterator.hpp +++ b/src/frontends/tensorflow_lite/include/openvino/frontend/tensorflow_lite/graph_iterator.hpp @@ -24,7 +24,7 @@ namespace tensorflow_lite { /// DecoderBaseOperation (for op 1), ..., DecoderBaseOperation (for op k), /// where n - number of inputs in the model, m - number of outputs in the model k - number of operation nodes. /// NOTE: constants are ignored and no decoder object is returned for constant. 
-class GraphIterator : ::ov::RuntimeAttribute { +class TENSORFLOW_LITE_API GraphIterator : ::ov::RuntimeAttribute { public: using Ptr = std::shared_ptr; diff --git a/src/frontends/tensorflow_lite/include/openvino/frontend/tensorflow_lite/quantization_info.hpp b/src/frontends/tensorflow_lite/include/openvino/frontend/tensorflow_lite/quantization_info.hpp index bd0f1e28283a27..66977db1caa5d4 100644 --- a/src/frontends/tensorflow_lite/include/openvino/frontend/tensorflow_lite/quantization_info.hpp +++ b/src/frontends/tensorflow_lite/include/openvino/frontend/tensorflow_lite/quantization_info.hpp @@ -15,7 +15,7 @@ namespace ov { namespace frontend { namespace tensorflow_lite { -class QuantizationInfo : public ov::RuntimeAttribute { +class TENSORFLOW_LITE_API QuantizationInfo : public ov::RuntimeAttribute { public: OPENVINO_RTTI("QuantizationInfo"); QuantizationInfo() = default; diff --git a/src/frontends/tensorflow_lite/include/openvino/frontend/tensorflow_lite/sparsity_info.hpp b/src/frontends/tensorflow_lite/include/openvino/frontend/tensorflow_lite/sparsity_info.hpp index 596cb651763d57..c1ab8d4fd04941 100644 --- a/src/frontends/tensorflow_lite/include/openvino/frontend/tensorflow_lite/sparsity_info.hpp +++ b/src/frontends/tensorflow_lite/include/openvino/frontend/tensorflow_lite/sparsity_info.hpp @@ -16,7 +16,7 @@ namespace ov { namespace frontend { namespace tensorflow_lite { -class SparsityInfo : public ov::RuntimeAttribute { +class TENSORFLOW_LITE_API SparsityInfo : public ov::RuntimeAttribute { public: struct SparsityDataDesc { uint8_t segments_type; From c5025cc6ca06753b0cf7091438d28cbac44139d0 Mon Sep 17 00:00:00 2001 From: Andrii Staikov Date: Sat, 19 Oct 2024 14:25:26 +0200 Subject: [PATCH 28/32] [TRANSFORMATIONS] Introduce a new method of testing SDPAToPA transformation (#27067) [TRANSFORMATIONS] Introduce a new method of testing SDPAToPA transformation Introduce a new method of testing the SDPAToPA transformation by not only checking if PagedAttentionExtension nodes appeared in a graph, but also check if the changes in number of nodes involved in the transformation aligned with the reference numbers for each model. Add a script for fast generation of reference values. 
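The comparison itself reduces to counting the tracked node types before and after `paged_attention_transformation` and diffing the two maps against the stored reference. A minimal sketch of the idea, assuming an already converted `ov_model` and a `model_id` present in the reference map (the `count_ops` helper exists only for illustration):

```python
from openvino._offline_transformations import paged_attention_transformation
from sdpa2pa_ref_diff import ref_diff_map, nodes_to_compare

def count_ops(model, tracked):
    # Count occurrences of each tracked operation type in the graph.
    counts = {}
    for op in model.get_ordered_ops():
        name = op.get_type_name()
        if name in tracked:
            counts[name] = counts.get(name, 0) + 1
    return counts

before = count_ops(ov_model, nodes_to_compare)
paged_attention_transformation(ov_model, False, False)  # SDPA -> PagedAttention
after = count_ops(ov_model, nodes_to_compare)

diff = {op: after.get(op, 0) - before.get(op, 0) for op in set(before) | set(after)}
assert diff == ref_diff_map[model_id], f"unexpected node-count diff for {model_id}: {diff}"
```

When both block-indices inputs and score outputs are requested (the cache-eviction case), `ref_diff_map_cache_eviction` is consulted instead of `ref_diff_map`.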
Signed-off-by: Andrii Staikov - Tickets: * CVS-152290 --- .../workflows/job_pytorch_models_tests.yml | 2 +- .../generate_ref_diffs.py | 94 +++ .../models/hf-tiny-random-models-precommit | 5 +- .../transformation_tests/sdpa2pa_ref_diff.py | 612 ++++++++++++++++++ .../test_pa_transformation.py | 30 +- 5 files changed, 731 insertions(+), 12 deletions(-) create mode 100644 tests/model_hub_tests/transformation_tests/generate_ref_diffs.py create mode 100644 tests/model_hub_tests/transformation_tests/sdpa2pa_ref_diff.py diff --git a/.github/workflows/job_pytorch_models_tests.yml b/.github/workflows/job_pytorch_models_tests.yml index 8f3699f6ab42a2..22a09dffba779f 100644 --- a/.github/workflows/job_pytorch_models_tests.yml +++ b/.github/workflows/job_pytorch_models_tests.yml @@ -137,7 +137,7 @@ jobs: if: ${{ inputs.model_scope == 'precommit' }} run: | export PYTHONPATH=${MODEL_HUB_TESTS_INSTALL_DIR}:$PYTHONPATH - python3 -m pytest ${MODEL_HUB_TESTS_INSTALL_DIR}/transformation_tests/test_pa_transformation.py -m precommit --html=${INSTALL_TEST_DIR}/TEST-torch_pagedattention_tests.html --self-contained-html -v --tb=short -n 2 + python3 -m pytest ${MODEL_HUB_TESTS_INSTALL_DIR}/transformation_tests/test_pa_transformation.py -m precommit --html=${INSTALL_TEST_DIR}/TEST-torch_pagedattention_tests.html --self-contained-html -vvv -s --tb=short -n 2 env: TEST_DEVICE: CPU USE_SYSTEM_CACHE: False diff --git a/tests/model_hub_tests/transformation_tests/generate_ref_diffs.py b/tests/model_hub_tests/transformation_tests/generate_ref_diffs.py new file mode 100644 index 00000000000000..6823256b3ccfc5 --- /dev/null +++ b/tests/model_hub_tests/transformation_tests/generate_ref_diffs.py @@ -0,0 +1,94 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +''' +Use this script if you need to regenerate reference diffs for each model +to test SDPAToPA transformation. + +The script will produce sdpa2pa_ref_diff.txt (or sdpa2pa_ref_diff_cache_eviction.txt +if using cache-eviction) containing a map in the +following format with nodes number changes for each model: + +ref_diff_map = { + "hf-internal-testing/tiny-random-LlamaForCausalLM" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-CohereForCausalLM" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + . + . + . +} + +The map has to be pasted into sdpa2pa_ref_diff.py (same directory) for +includes to test SDPAToPA transformation. + +Run the script by using 'python generate_ref_diffs.py' or 'python generate_ref_diffs.py True' +for generating the same map, but utilizing cache-eviction. 
+''' + +import os +import sys +from pathlib import Path +import models_hub_common.utils as utils +from openvino._offline_transformations import paged_attention_transformation +from openvino._pyopenvino.op import _PagedAttentionExtension, Parameter, Result +from optimum.intel import OVModelForCausalLM + +nodes_to_compare = ("ScaledDotProductAttention", "PagedAttentionExtension", "Parameter", "ReadValue", "Assign") + +def main(): + use_cache_eviction = False + if len(sys.argv) >= 2: + use_cache_eviction = sys.argv[1].lower() in 'true' + + OUTPUT_FILE = Path(os.path.join(os.path.dirname(__file__)), 'sdpa2pa_ref_diff' + ('_cache_eviction.txt' if use_cache_eviction else '.txt')) + + if OUTPUT_FILE.exists() and OUTPUT_FILE.is_file(): + OUTPUT_FILE.unlink() + + with open(OUTPUT_FILE, 'w') as file: + model_list = utils.get_models_list(os.path.join(os.path.dirname(__file__), "models", "hf-tiny-random-models-precommit")) + print(OUTPUT_FILE) + print('ref_diff_map_cache_eviction = {' if use_cache_eviction else 'ref_diff_map = {', file=file) + + for model_id, _, _, _ in model_list: + # wrapping in try/catch block to continue printing models even if one has failed + try: + model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) + except: + continue + + before_map = {} + for op in model.model.get_ordered_ops(): + if op.get_type_name() in nodes_to_compare: + before_map[op.get_type_name()] = before_map.get(op.get_type_name(), 0) + 1 + + # wrapping in try/catch block to continue printing models even if one has failed + try: + paged_attention_transformation(model.model, use_cache_eviction, use_cache_eviction) + except: + continue + + after_map = {} + for op in model.model.get_ordered_ops(): + if op.get_type_name() in nodes_to_compare: + after_map[op.get_type_name()] = after_map.get(op.get_type_name(), 0) + 1 + + print(f'\t"{model_id}" : {{', file=file) + for op in set(after_map.keys()) | set(before_map.keys()): + print(f'\t\t"{op}" : {after_map.get(op, 0) - before_map.get(op, 0)},', file=file) + print('\t},', file=file) + print('}', file=file) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/model_hub_tests/transformation_tests/models/hf-tiny-random-models-precommit b/tests/model_hub_tests/transformation_tests/models/hf-tiny-random-models-precommit index c3ec331fcda0bc..7c89c451ea4be5 100644 --- a/tests/model_hub_tests/transformation_tests/models/hf-tiny-random-models-precommit +++ b/tests/model_hub_tests/transformation_tests/models/hf-tiny-random-models-precommit @@ -40,7 +40,4 @@ Xenova/tiny-random-Phi3ForCausalLM,https://huggingface.co/Xenova/tiny-random-Phi facebook/opt-125m,https://huggingface.co/facebook/opt-125m facebook/opt-350m,https://huggingface.co/facebook/opt-350m katuni4ka/tiny-random-chatglm2,https://huggingface.co/katuni4ka/tiny-random-chatglm2 -katuni4ka/tiny-random-glm4,https://huggingface.co/katuni4ka/tiny-random-glm4 -hf-internal-testing/tiny-random-BioGptForCausalLM,https://huggingface.co/hf-internal-testing/tiny-random-BioGptForCausalLM,xfail,No ScaledDotProductAttention operation observed in the graph CVS-145820 -hf-internal-testing/tiny-random-XGLMForCausalLM,https://huggingface.co/hf-tiny-model-private/tiny-random-XGLMForCausalLM,xfail,No ScaledDotProductAttention operation observed in the graph CVS-145820 -katuni4ka/tiny-random-orion,https://huggingface.co/katuni4ka/tiny-random-orion,xfail,No ScaledDotProductAttention operation observed in the graph CVS-145820 \ No newline at end of file 
+katuni4ka/tiny-random-glm4,https://huggingface.co/katuni4ka/tiny-random-glm4 \ No newline at end of file diff --git a/tests/model_hub_tests/transformation_tests/sdpa2pa_ref_diff.py b/tests/model_hub_tests/transformation_tests/sdpa2pa_ref_diff.py new file mode 100644 index 00000000000000..23af913d9d102f --- /dev/null +++ b/tests/model_hub_tests/transformation_tests/sdpa2pa_ref_diff.py @@ -0,0 +1,612 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +nodes_to_compare = ("ScaledDotProductAttention", "PagedAttentionExtension", "Parameter", "ReadValue", "Assign") + +ref_diff_map = { + "hf-internal-testing/tiny-random-LlamaForCausalLM" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-CohereForCausalLM" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-GPTJForCausalLM" : { + "PagedAttentionExtension" : 5, + "ScaledDotProductAttention" : -5, + "Parameter" : 13, + "ReadValue" : -10, + "Assign" : -10, + }, + "hf-internal-testing/tiny-random-GPTNeoForCausalLM" : { + "PagedAttentionExtension" : 4, + "ScaledDotProductAttention" : -4, + "Parameter" : 11, + "ReadValue" : -8, + "Assign" : -8, + }, + "hf-internal-testing/tiny-random-GPTNeoXForCausalLM" : { + "PagedAttentionExtension" : 5, + "ScaledDotProductAttention" : -5, + "Parameter" : 13, + "ReadValue" : -10, + "Assign" : -10, + }, + "hf-internal-testing/tiny-random-MistralForCausalLM" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-CodeGenForCausalLM" : { + "PagedAttentionExtension" : 5, + "ScaledDotProductAttention" : -5, + "Parameter" : 13, + "ReadValue" : -10, + "Assign" : -10, + }, + "hf-internal-testing/Mixtral-tiny" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-GPTBigCodeForCausalLM" : { + "PagedAttentionExtension" : 5, + "ScaledDotProductAttention" : -5, + "Parameter" : 13, + "ReadValue" : -5, + "Assign" : -5, + }, + "hf-internal-testing/tiny-random-Starcoder2ForCausalLM" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-BloomForCausalLM" : { + "PagedAttentionExtension" : 5, + "ScaledDotProductAttention" : -5, + "Parameter" : 14, + "ReadValue" : -10, + "Assign" : -10, + }, + "hf-internal-testing/tiny-random-gpt2" : { + "PagedAttentionExtension" : 5, + "ScaledDotProductAttention" : -5, + "Parameter" : 13, + "ReadValue" : -10, + "Assign" : -10, + }, + "hf-internal-testing/tiny-random-BlenderbotForCausalLM" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 8, + "ReadValue" : -4, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-PegasusForCausalLM" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 8, + "ReadValue" : -4, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-PhiForCausalLM" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-MptForCausalLM" : { + "PagedAttentionExtension" : 5, + 
"ScaledDotProductAttention" : -5, + "Parameter" : 14, + "ReadValue" : -10, + "Assign" : -10, + }, + "hf-internal-testing/tiny-random-StableLmForCausalLM" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-PersimmonForCausalLM" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-FalconForCausalLM" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "hf-tiny-model-private/tiny-random-OPTForCausalLM" : { + "PagedAttentionExtension" : 5, + "ScaledDotProductAttention" : -5, + "Parameter" : 14, + "ReadValue" : -10, + "Assign" : -10, + }, + "katuni4ka/tiny-random-xverse" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "katuni4ka/tiny-random-baichuan2-13b" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "katuni4ka/tiny-random-qwen" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "katuni4ka/tiny-random-aquilachat" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "katuni4ka/tiny-random-aquila2" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "katuni4ka/tiny-random-qwen1.5-moe" : { + "PagedAttentionExtension" : 4, + "ScaledDotProductAttention" : -4, + "Parameter" : 11, + "ReadValue" : -8, + "Assign" : -8, + }, + "katuni4ka/tiny-random-codegen2" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "katuni4ka/tiny-random-olmo-hf" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "katuni4ka/tiny-random-baichuan2" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "katuni4ka/tiny-random-jais" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "katuni4ka/tiny-random-internlm" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "katuni4ka/tiny-random-internlm2" : { + "PagedAttentionExtension" : 4, + "ScaledDotProductAttention" : -4, + "Parameter" : 11, + "ReadValue" : -8, + "Assign" : -8, + }, + "katuni4ka/tiny-random-minicpm" : { + "ReadValue" : -8, + "ScaledDotProductAttention" : -4, + "Assign" : -8, + "PagedAttentionExtension" : 4, + "Parameter" : 11, + }, + "katuni4ka/tiny-random-falcon-40b" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "katuni4ka/tiny-random-dbrx" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "fxmarty/tiny-random-GemmaForCausalLM" : { + "PagedAttentionExtension" : 1, + "ScaledDotProductAttention" : -1, + "Parameter" : 5, + "ReadValue" : -2, + 
"Assign" : -2, + }, + "fxmarty/tiny-dummy-qwen2" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "fxmarty/really-tiny-falcon-testing" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "Xenova/tiny-random-Phi3ForCausalLM" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "facebook/opt-125m" : { + "PagedAttentionExtension" : 12, + "ScaledDotProductAttention" : -12, + "Parameter" : 28, + "ReadValue" : -24, + "Assign" : -24, + }, + "facebook/opt-350m" : { + "PagedAttentionExtension" : 24, + "ScaledDotProductAttention" : -24, + "Parameter" : 52, + "ReadValue" : -48, + "Assign" : -48, + }, + "katuni4ka/tiny-random-chatglm2" : { + "PagedAttentionExtension" : 2, + "ScaledDotProductAttention" : -2, + "Parameter" : 7, + "ReadValue" : -4, + "Assign" : -4, + }, + "katuni4ka/tiny-random-glm4" : { + "PagedAttentionExtension" : 6, + "ScaledDotProductAttention" : -6, + "Parameter" : 15, + "ReadValue" : -12, + "Assign" : -12, + }, +} + +ref_diff_map_cache_eviction = { + "hf-internal-testing/tiny-random-LlamaForCausalLM" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-CohereForCausalLM" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-GPTJForCausalLM" : { + "ScaledDotProductAttention" : -5, + "ReadValue" : -10, + "PagedAttentionExtension" : 5, + "Parameter" : 17, + "Assign" : -10, + }, + "hf-internal-testing/tiny-random-GPTNeoForCausalLM" : { + "ScaledDotProductAttention" : -4, + "ReadValue" : -8, + "PagedAttentionExtension" : 4, + "Parameter" : 14, + "Assign" : -8, + }, + "hf-internal-testing/tiny-random-GPTNeoXForCausalLM" : { + "ScaledDotProductAttention" : -5, + "ReadValue" : -10, + "PagedAttentionExtension" : 5, + "Parameter" : 17, + "Assign" : -10, + }, + "hf-internal-testing/tiny-random-MistralForCausalLM" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-CodeGenForCausalLM" : { + "ScaledDotProductAttention" : -5, + "ReadValue" : -10, + "PagedAttentionExtension" : 5, + "Parameter" : 17, + "Assign" : -10, + }, + "hf-internal-testing/Mixtral-tiny" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-GPTBigCodeForCausalLM" : { + "ScaledDotProductAttention" : -5, + "ReadValue" : -5, + "PagedAttentionExtension" : 5, + "Parameter" : 17, + "Assign" : -5, + }, + "hf-internal-testing/tiny-random-Starcoder2ForCausalLM" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-BloomForCausalLM" : { + "ScaledDotProductAttention" : -5, + "ReadValue" : -10, + "PagedAttentionExtension" : 5, + "Parameter" : 18, + "Assign" : -10, + }, + "hf-internal-testing/tiny-random-gpt2" : { + "ScaledDotProductAttention" : -5, + "ReadValue" : -10, + "PagedAttentionExtension" : 5, + "Parameter" : 17, + "Assign" : -10, + }, + "hf-internal-testing/tiny-random-BlenderbotForCausalLM" : { + 
"ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 9, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-PegasusForCausalLM" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 9, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-PhiForCausalLM" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-MptForCausalLM" : { + "ScaledDotProductAttention" : -5, + "ReadValue" : -10, + "PagedAttentionExtension" : 5, + "Parameter" : 18, + "Assign" : -10, + }, + "hf-internal-testing/tiny-random-StableLmForCausalLM" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-PersimmonForCausalLM" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "hf-internal-testing/tiny-random-FalconForCausalLM" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "hf-tiny-model-private/tiny-random-OPTForCausalLM" : { + "ScaledDotProductAttention" : -5, + "ReadValue" : -10, + "PagedAttentionExtension" : 5, + "Parameter" : 18, + "Assign" : -10, + }, + "katuni4ka/tiny-random-xverse" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "katuni4ka/tiny-random-baichuan2-13b" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "katuni4ka/tiny-random-qwen" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "katuni4ka/tiny-random-aquilachat" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "katuni4ka/tiny-random-aquila2" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "katuni4ka/tiny-random-qwen1.5-moe" : { + "ScaledDotProductAttention" : -4, + "ReadValue" : -8, + "PagedAttentionExtension" : 4, + "Parameter" : 14, + "Assign" : -8, + }, + "katuni4ka/tiny-random-codegen2" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "katuni4ka/tiny-random-olmo-hf" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "katuni4ka/tiny-random-baichuan2" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "katuni4ka/tiny-random-jais" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "katuni4ka/tiny-random-internlm" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "katuni4ka/tiny-random-internlm2" : { + "ScaledDotProductAttention" : -4, + "ReadValue" : -8, + "PagedAttentionExtension" : 4, + "Parameter" : 14, + "Assign" : -8, + }, + "katuni4ka/tiny-random-minicpm" : { + "ScaledDotProductAttention" : -4, + 
"Parameter" : 14, + "PagedAttentionExtension" : 4, + "ReadValue" : -8, + "Assign" : -8, + }, + "katuni4ka/tiny-random-falcon-40b" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "katuni4ka/tiny-random-dbrx" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "fxmarty/tiny-random-GemmaForCausalLM" : { + "ScaledDotProductAttention" : -1, + "ReadValue" : -2, + "PagedAttentionExtension" : 1, + "Parameter" : 5, + "Assign" : -2, + }, + "fxmarty/tiny-dummy-qwen2" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "fxmarty/really-tiny-falcon-testing" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "Xenova/tiny-random-Phi3ForCausalLM" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "facebook/opt-125m" : { + "ScaledDotProductAttention" : -12, + "ReadValue" : -24, + "PagedAttentionExtension" : 12, + "Parameter" : 39, + "Assign" : -24, + }, + "facebook/opt-350m" : { + "ScaledDotProductAttention" : -24, + "ReadValue" : -48, + "PagedAttentionExtension" : 24, + "Parameter" : 75, + "Assign" : -48, + }, + "katuni4ka/tiny-random-chatglm2" : { + "ScaledDotProductAttention" : -2, + "ReadValue" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 8, + "Assign" : -4, + }, + "katuni4ka/tiny-random-glm4" : { + "ScaledDotProductAttention" : -6, + "ReadValue" : -12, + "PagedAttentionExtension" : 6, + "Parameter" : 20, + "Assign" : -12, + }, +} diff --git a/tests/model_hub_tests/transformation_tests/test_pa_transformation.py b/tests/model_hub_tests/transformation_tests/test_pa_transformation.py index dc65324d4f028b..02481439818f28 100644 --- a/tests/model_hub_tests/transformation_tests/test_pa_transformation.py +++ b/tests/model_hub_tests/transformation_tests/test_pa_transformation.py @@ -6,6 +6,7 @@ from optimum.intel import OVModelForCausalLM from models_hub_common.utils import retry import models_hub_common.utils as utils +from sdpa2pa_ref_diff import ref_diff_map, ref_diff_map_cache_eviction, nodes_to_compare import pytest import os import re @@ -14,15 +15,28 @@ def run_pa(tmp_path, model_id, model_link, use_block_indices_inputs, use_score_outputs): model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) + before_map = {} + for op in model.model.get_ordered_ops(): + if op.get_type_name() in nodes_to_compare: + before_map[op.get_type_name()] = before_map.get(op.get_type_name(), 0) + 1 + paged_attention_transformation(model.model, use_block_indices_inputs, use_score_outputs) - # Test that a _PagedAttentionExtension node appeared after the transformation. 
- pa_counter = 0 + after_map = {} for op in model.model.get_ordered_ops(): - if isinstance(op, _PagedAttentionExtension): - pa_counter += 1 + if op.get_type_name() in nodes_to_compare: + after_map[op.get_type_name()] = after_map.get(op.get_type_name(), 0) + 1 + + # Collect the changes of nodes from nodes_to_compare + # And check if the numbers correspond to the reference ones + resulting_map = {} + for op in set(after_map.keys()) | set(before_map.keys()): + resulting_map[op] = after_map.get(op, 0) - before_map.get(op, 0) + + use_cache_eviction = use_block_indices_inputs and use_score_outputs + reference_map = ref_diff_map_cache_eviction[model_id] if use_cache_eviction else ref_diff_map[model_id] - assert pa_counter > 0, f"The model '{model_id}' has no _PagedAttentionExtension present." + assert reference_map == resulting_map model_inputs = model.model.inputs for input in model_inputs: @@ -45,7 +59,8 @@ def run_pa(tmp_path, model_id, model_link, use_block_indices_inputs, use_score_o if re.search(block_indices_pattern, name): block_indices_counter += 1 - assert(block_indices_counter == pa_counter) + assert block_indices_counter == resulting_map["PagedAttentionExtension"], \ + f"The number of block_indices inputs doesn't correspond to the expected value. Expected {resulting_map['PagedAttentionExtension']}, received {block_indices_counter}" if (use_score_outputs): score_pattern = r'scores\.[0-9]+' @@ -57,7 +72,8 @@ def run_pa(tmp_path, model_id, model_link, use_block_indices_inputs, use_score_o if re.search(score_pattern, name): score_outputs_counter += 1 - assert(score_outputs_counter == pa_counter) + assert block_indices_counter == resulting_map["PagedAttentionExtension"], \ + f"The number of scores outputs doesn't correspond to the expected value. Expected {resulting_map['PagedAttentionExtension']}, received {block_indices_counter}" @pytest.mark.precommit @pytest.mark.parametrize("model_name, model_link, mark, reason", utils.get_models_list(os.path.join(os.path.dirname(__file__), "models", "hf-tiny-random-models-precommit"))) From 9a02e5475b946ee710a55c30d391a937f46c1d0a Mon Sep 17 00:00:00 2001 From: Halm Zenger Date: Sun, 20 Oct 2024 15:17:09 +0100 Subject: [PATCH 29/32] [JAX FE] Support lax.argmax operation for JAX (#26671) ### Details: - Support lax.argmax for JAX and create relevant layer test - 2 util improvements - Fix `num_inputs_check` not checking max inputs - Better error message when param name not exist ### Tickets: - #26574 --------- Co-authored-by: Roman Kazantsev --- .../openvino/frontend/jax/node_context.hpp | 1 + src/frontends/jax/src/op/argmax.cpp | 42 +++++++++++++ src/frontends/jax/src/op_table.cpp | 2 + src/frontends/jax/src/utils.cpp | 1 + tests/layer_tests/jax_tests/test_argmax.py | 62 +++++++++++++++++++ 5 files changed, 108 insertions(+) create mode 100644 src/frontends/jax/src/op/argmax.cpp create mode 100644 tests/layer_tests/jax_tests/test_argmax.py diff --git a/src/frontends/jax/include/openvino/frontend/jax/node_context.hpp b/src/frontends/jax/include/openvino/frontend/jax/node_context.hpp index 101161a4ec03c9..015713ad72847d 100644 --- a/src/frontends/jax/include/openvino/frontend/jax/node_context.hpp +++ b/src/frontends/jax/include/openvino/frontend/jax/node_context.hpp @@ -101,6 +101,7 @@ class NodeContext : public frontend::NodeContext { } Output get_param(const std::string& name) const { + FRONT_END_GENERAL_CHECK(m_param_name_to_id.count(name), "No param id corresponding name exists: ", name); auto id = m_param_name_to_id.at(name); 
FRONT_END_GENERAL_CHECK(m_tensor_map->count(id), "No tensor corresponding param id: ", id, " exist."); return m_tensor_map->at(id); diff --git a/src/frontends/jax/src/op/argmax.cpp b/src/frontends/jax/src/op/argmax.cpp new file mode 100644 index 00000000000000..60d852c6d0f358 --- /dev/null +++ b/src/frontends/jax/src/op/argmax.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/frontend/jax/node_context.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/squeeze.hpp" +#include "openvino/op/topk.hpp" +#include "utils.hpp" + +namespace ov { +namespace frontend { +namespace jax { +namespace op { + +using namespace ov::op; + +OutputVector translate_argmax(const NodeContext& context) { + num_inputs_check(context, 1, 1); + Output input = context.get_input(0); + auto axis_val = context.const_named_param("axes"); + auto axis = context.const_named_param>("axes"); + auto dtype = convert_dtype(context.const_named_param("index_dtype")); + + auto k = std::make_shared(element::i64, Shape{}, 1); + auto topk = std::make_shared(input, + k, + axis_val, + v11::TopK::Mode::MAX, + v1::TopK::SortType::SORT_VALUES, + dtype, + true); + auto indices = topk->output(1); + + auto res = std::make_shared(indices, axis); + return {res}; +}; + +} // namespace op +} // namespace jax +} // namespace frontend +} // namespace ov \ No newline at end of file diff --git a/src/frontends/jax/src/op_table.cpp b/src/frontends/jax/src/op_table.cpp index 5e92e3de6e212a..500226594fea13 100644 --- a/src/frontends/jax/src/op_table.cpp +++ b/src/frontends/jax/src/op_table.cpp @@ -36,6 +36,7 @@ namespace op { template \ OutputVector op(const ov::frontend::jax::NodeContext& node) +OP_CONVERTER(translate_argmax); OP_T_CONVERTER(translate_binary_op); OP_CONVERTER(translate_broadcast_in_dim); OP_CONVERTER(translate_concatenate); @@ -59,6 +60,7 @@ OP_CONVERTER(translate_transpose); // Supported ops for Jaxpr const std::map get_supported_ops_jaxpr() { return {{"add", op::translate_1to1_match_2_inputs}, + {"argmax", op::translate_argmax}, {"broadcast_in_dim", op::translate_broadcast_in_dim}, {"concatenate", op::translate_concatenate}, {"constant", op::translate_constant}, diff --git a/src/frontends/jax/src/utils.cpp b/src/frontends/jax/src/utils.cpp index d47abfbba56188..f626031ec8dc58 100644 --- a/src/frontends/jax/src/utils.cpp +++ b/src/frontends/jax/src/utils.cpp @@ -16,6 +16,7 @@ namespace jax { void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs) { auto inputs = context.inputs(); FRONT_END_OP_CONVERSION_CHECK(inputs.size() >= min_inputs, "Got less inputs than expected"); + FRONT_END_OP_CONVERSION_CHECK(inputs.size() <= max_inputs, "Got more inputs than expected"); } void num_inputs_check(const NodeContext& context, size_t min_inputs) { diff --git a/tests/layer_tests/jax_tests/test_argmax.py b/tests/layer_tests/jax_tests/test_argmax.py new file mode 100644 index 00000000000000..372aede2b4ba33 --- /dev/null +++ b/tests/layer_tests/jax_tests/test_argmax.py @@ -0,0 +1,62 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import pytest +from jax import lax +from jax import numpy as jnp + +from jax_layer_test_class import JaxLayerTest + +rng = np.random.default_rng(706670) + + +class TestArgmax(JaxLayerTest): + def _prepare_input(self): + if np.issubdtype(self.input_type, np.floating): + x = rng.uniform(-5.0, 5.0, + self.input_shape).astype(self.input_type) + 
elif np.issubdtype(self.input_type, np.signedinteger): + x = rng.integers(-8, 8, self.input_shape).astype(self.input_type) + else: + x = rng.integers(0, 8, self.input_shape).astype(self.input_type) + + if self.input_duplicate: + x = np.concatenate((x, x), axis=self.axis) + + x = jnp.array(x) + return [x] + + def create_model(self, input_shape, axis, input_type, index_dtype, input_duplicate): + self.input_shape = input_shape + self.axis = axis + self.input_type = input_type + self.input_duplicate = input_duplicate + + def jax_argmax(inp): + out = lax.argmax(inp, axis, index_dtype) + return out + + return jax_argmax, None, 'argmax' + + # Only [0, rank - 1] are valid axes for lax.argmax + @pytest.mark.parametrize('input_shape, axis', [([64], 0), + ([64, 16], 0), + ([64, 16], 1), + ([48, 23, 54], 0), + ([48, 23, 54], 1), + ([48, 23, 54], 2), + ([2, 18, 32, 25], 0), + ([2, 18, 32, 25], 1), + ([2, 18, 32, 25], 2), + ([2, 18, 32, 25], 3)]) + @pytest.mark.parametrize('input_type', [np.int8, np.uint8, np.int16, np.uint16, + np.int32, np.uint32, np.int64, np.uint64, + np.float16, np.float32, np.float64]) + @pytest.mark.parametrize("index_dtype", [np.int32, np.int64]) + @pytest.mark.parametrize("input_duplicate", [False, True]) + @pytest.mark.nightly + @pytest.mark.precommit_jax_fe + def test_argmax(self, ie_device, precision, ir_version, input_shape, axis, input_type, index_dtype, input_duplicate): + self._test(*self.create_model(input_shape, axis, input_type, index_dtype, input_duplicate), + ie_device, precision, ir_version) From 2be7e5f945a2eb116181a670ebb338a8fa533f6a Mon Sep 17 00:00:00 2001 From: Wilson Seok Date: Sun, 20 Oct 2024 21:50:57 -0700 Subject: [PATCH 30/32] [GPU] Fix fused op macro for dynamic shape eltwise fusing into convolution, fix deconvolution attribute kernel param when 1d (#27010) ### Details: - fix fused op input load macro for convolution with dynamic shape eltwise fusing - fix deconvolution kernel stride, pad, dilation axis extension for 1d ### Tickets: - 152406 --- .../src/graph/impls/ocl/deconvolution.cpp | 16 +- .../intel_gpu/src/kernel_selector/jitter.cpp | 16 +- .../convolution_backprop_data.cpp | 36 +++ .../dynamic/convolution.cpp | 253 ++++++++++++++++++ .../dynamic/convolution_backprop_data.cpp | 49 +++- 5 files changed, 358 insertions(+), 12 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/deconvolution.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/deconvolution.cpp index 5e3462a6256364..95bd66867c1b8f 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/deconvolution.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/deconvolution.cpp @@ -5,6 +5,7 @@ #include "primitive_base.hpp" #include "deconvolution_inst.h" +#include "intel_gpu/plugin/common_utils.hpp" #include "deconvolution/deconvolution_kernel_selector.h" #include "deconvolution/deconvolution_kernel_base.h" @@ -54,19 +55,16 @@ struct deconvolution_impl : typed_primitive_impl_ocl { params.filterSize = { kx, ky, kz }; - uint32_t pad_z = std::max(pad.size() >= 3 ? pad[pad.size() - 3] : 0, 0); - uint32_t pad_y = std::max(pad.size() >= 2 ? pad[pad.size() - 2] : 0, 0); - uint32_t pad_x = std::max(pad.size() >= 1 ? pad[pad.size() - 1] : 0, 0); + uint32_t pad_x, pad_y, pad_z; + std::tie(pad_x, pad_y, pad_z) = ov::intel_gpu::get_xyz(pad, 0); params.padding = {pad_x, pad_y, pad_z}; - uint32_t stride_z = stride.size() >= 3 ? static_cast(stride[stride.size() - 3]) : 1; - uint32_t stride_y = stride.size() >= 2 ? 
static_cast(stride[stride.size() - 2]) : 1; - uint32_t stride_x = stride.size() >= 1 ? static_cast(stride[stride.size() - 1]) : 1; + uint32_t stride_x, stride_y, stride_z; + std::tie(stride_x, stride_y, stride_z) = ov::intel_gpu::get_xyz(stride, 1); params.stride = {stride_x, stride_y, stride_z}; - uint32_t dilation_z = dilation.size() >= 3 ? static_cast(dilation[dilation.size() - 3]) : 1; - uint32_t dilation_y = dilation.size() >= 2 ? static_cast(dilation[dilation.size() - 2]) : 1; - uint32_t dilation_x = dilation.size() >= 1 ? static_cast(dilation[dilation.size() - 1]) : 1; + uint32_t dilation_x, dilation_y, dilation_z; + std::tie(dilation_x, dilation_y, dilation_z) = ov::intel_gpu::get_xyz(dilation, 1); params.dilation = {dilation_x, dilation_y, dilation_z}; return params; diff --git a/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp b/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp index 480282b6060f16..33d13429fdcf3f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp @@ -2197,7 +2197,21 @@ std::string FusedOpsCodeGenerator::GetJitLoad(const FusedOpsConfiguration& conf, if (vec_size > 1) { return block_read; - } else if (input_tensor.LogicalSize() > 1) { + } + + bool multiple_elements = false; + // For dynamic shape input tensor, check any one of static dimension has more than one element. + if (input_tensor.is_dynamic()) { + for (auto dim : input_tensor.GetDims()) { + auto v = dim.v; + if (v > 1) { + multiple_elements = true; + break; + } + } + } + + if (input_tensor.LogicalSize() > 1 || multiple_elements) { // Currently we assume that in such scenario we can safely load sub_group_size elements from the pointer return Broadcast(block_read, input_dt, vec_size); } else { diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/convolution_backprop_data.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/convolution_backprop_data.cpp index f379b29ce23389..489f4096795361 100644 --- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/convolution_backprop_data.cpp +++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/convolution_backprop_data.cpp @@ -211,4 +211,40 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvolutionBackpropData3D_AutoPadding_OutputPaddi ::testing::Values(ov::test::utils::DEVICE_GPU)), ConvolutionBackpropDataLayerTest::getTestCaseName); +const std::vector numOutChannels1d = {256}; + +/* ============= 1D ConvolutionBackpropData ============= */ +const std::vector netPrecisions1D = { + ov::element::f32 +}; + +const std::vector> inputShapes1D = {{{1, 512, 577}}}; +const std::vector> kernels1D = {{16}}; +const std::vector> strides1D = {{8}}; +const std::vector> padBegins1D = {{4}}; +const std::vector> padEnds1D = {{4}}; +const std::vector> dilations1D = {{1}}; + + +const std::vector> outputPadding1D = {{0}}; + +const auto conv1DParams_ExplicitPadding_output_padding = ::testing::Combine( + ::testing::ValuesIn(kernels1D), + ::testing::ValuesIn(strides1D), + ::testing::ValuesIn(padBegins1D), + ::testing::ValuesIn(padEnds1D), + ::testing::ValuesIn(dilations1D), + ::testing::ValuesIn(numOutChannels1d), + ::testing::Values(ov::op::PadType::EXPLICIT), + ::testing::ValuesIn(outputPadding1D) +); + +INSTANTIATE_TEST_SUITE_P(smoke_ConvolutionBackpropData1D_ExplicitPadding, ConvolutionBackpropDataLayerTest, + ::testing::Combine( + conv1DParams_ExplicitPadding_output_padding, + 
::testing::ValuesIn(netPrecisions1D), + ::testing::ValuesIn(ov::test::static_shapes_to_test_representation(inputShapes1D)), + ::testing::ValuesIn(emptyOutputShape), + ::testing::Values(ov::test::utils::DEVICE_GPU)), + ConvolutionBackpropDataLayerTest::getTestCaseName); } // namespace diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/convolution.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/convolution.cpp index 093fca68b482fa..216a1b397c90bc 100644 --- a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/convolution.cpp +++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/convolution.cpp @@ -4,6 +4,7 @@ #include "common_test_utils/ov_tensor_utils.hpp" #include "common_test_utils/node_builders/activation.hpp" #include "common_test_utils/node_builders/convolution.hpp" +#include "common_test_utils/node_builders/eltwise.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" #include "shared_test_classes/single_op/convolution.hpp" @@ -317,4 +318,256 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvolutionLayerGPUTest_dynamic3DAsymPad, Convolu ::testing::Values(false)), ConvolutionLayerGPUTestDynamic::getTestCaseName); +typedef std::tuple< + convSpecificParams, + ov::element::Type, // Model type + std::vector, // Input shapes + std::string, // Device name + bool // activation fusing +> convLayerFusingTestParamsSet; + + +class ConvolutionLayerGPUTestDynamicEltwiseFusing : public testing::WithParamInterface, + virtual public ov::test::SubgraphBaseTest { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + convSpecificParams convParams; + ov::element::Type model_type; + std::vector inputShapes; + std::string targetDevice; + bool activationFusing; + std::tie(convParams, model_type, inputShapes, targetDevice, activationFusing) = obj.param; + + ov::op::PadType padType; + std::vector kernel, stride, dilation; + std::vector padBegin, padEnd; + size_t convOutChannels; + std::tie(kernel, stride, padBegin, padEnd, dilation, convOutChannels, padType) = convParams; + + std::ostringstream result; + for (const auto& inputShape : inputShapes) { + result << "IS="; + result << ov::test::utils::partialShape2str({inputShape.first}) << "_"; + result << "TS=("; + for (const auto& shape : inputShape.second) { + result << ov::test::utils::vec2str(shape) << "_"; + } + } + result << ")_"; + result << "K" << ov::test::utils::vec2str(kernel) << "_"; + result << "S" << ov::test::utils::vec2str(stride) << "_"; + result << "PB" << ov::test::utils::vec2str(padBegin) << "_"; + result << "PE" << ov::test::utils::vec2str(padEnd) << "_"; + result << "D=" << ov::test::utils::vec2str(dilation) << "_"; + result << "O=" << convOutChannels << "_"; + result << "AP=" << padType << "_"; + result << "netPRC=" << model_type << "_"; + result << "trgDev=" << targetDevice << "_"; + result << "activationFusing=" << activationFusing; + + return result.str(); + } + +protected: + void SetUp() override { + convSpecificParams convParams; + std::vector inputShapes; + auto model_type = ov::element::undefined; + bool activationFusing; + std::tie(convParams, model_type, inputShapes, targetDevice, activationFusing) = this->GetParam(); + + init_input_shapes({inputShapes}); + + ov::op::PadType padType; + std::vector kernel, stride, dilation; + std::vector padBegin, padEnd; + size_t convOutChannels; + std::tie(kernel, stride, padBegin, padEnd, dilation, convOutChannels, padType) = convParams; + + ov::ParameterVector inputParams; + for (auto&& 
shape : inputDynamicShapes) + inputParams.push_back(std::make_shared(model_type, shape)); + + auto convolutionNode = ov::test::utils::make_convolution(inputParams.front(), model_type, kernel, stride, padBegin, + padEnd, dilation, padType, convOutChannels); + if (activationFusing) { + auto activationNode = ov::test::utils::make_activation(convolutionNode, model_type, ov::test::utils::ActivationTypes::Relu); + auto eltwiseNode = ov::test::utils::make_eltwise(inputParams.back(), activationNode, ov::test::utils::EltwiseTypes::ADD); + + ov::ResultVector results; + for (size_t i = 0; i < eltwiseNode->get_output_size(); i++) + results.push_back(std::make_shared(eltwiseNode->output(i))); + + function = std::make_shared(results, inputParams, "Convolution"); + } else { + auto eltwiseNode = ov::test::utils::make_eltwise(inputParams.back(), convolutionNode, ov::test::utils::EltwiseTypes::ADD); + + ov::ResultVector results; + for (size_t i = 0; i < eltwiseNode->get_output_size(); i++) + results.push_back(std::make_shared(eltwiseNode->output(i))); + + function = std::make_shared(results, inputParams, "Convolution"); + } + } +}; + +TEST_P(ConvolutionLayerGPUTestDynamicEltwiseFusing, Inference) { + run(); +} +const std::vector> dynInputShapes1D_test = { + { + { + {1, 192, ov::Dimension::dynamic()}, + {{1, 192, 191}} + }, + { + {1, 192, ov::Dimension::dynamic()}, + {{1, 192, 1}} + } + }, + { + { + {ov::Dimension::dynamic(), 192, ov::Dimension::dynamic()}, + {{1, 192, 257}} + }, + { + {1, 1, ov::Dimension::dynamic()}, + {{1, 1, 257}} + } + }, + { + { + {ov::Dimension::dynamic(), 192, ov::Dimension::dynamic()}, + {{1, 192, 257}} + }, + { + {1, ov::Dimension::dynamic(), ov::Dimension::dynamic()}, + {{1, 1, 1}} + } + }, + { + { + {ov::Dimension::dynamic(), 192, ov::Dimension::dynamic()}, + {{1, 192, 1}} + }, + { + {1, ov::Dimension::dynamic(), ov::Dimension::dynamic()}, + {{1, 1, 1}} + } + }, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_ConvolutionLayerGPUTest_dynamic1D_test_0, ConvolutionLayerGPUTestDynamicEltwiseFusing, + ::testing::Combine( + ::testing::Combine( + ::testing::Values(std::vector{1}), + ::testing::Values(std::vector{1}), + ::testing::Values(std::vector{0}), + ::testing::Values(std::vector{0}), + ::testing::Values(std::vector{1}), + ::testing::Values(192), + ::testing::Values(ov::op::PadType::EXPLICIT)), + ::testing::Values(ov::element::f32), + ::testing::ValuesIn(dynInputShapes1D_test), + ::testing::Values(ov::test::utils::DEVICE_GPU), + ::testing::Values(false)), + ConvolutionLayerGPUTestDynamicEltwiseFusing::getTestCaseName); + +const std::vector> dynInputShapes1D_test1 = { + { + { + {1, 512, ov::Dimension::dynamic()}, + {{1, 512, 191}} + }, + { + {1, 512, ov::Dimension::dynamic()}, + {{1, 512, 1}} + } + }, + { + { + {ov::Dimension::dynamic(), 512, ov::Dimension::dynamic()}, + {{1, 512, 191}} + }, + { + {1, 1, ov::Dimension::dynamic()}, + {{1, 1, 191}} + } + }, + { + { + {ov::Dimension::dynamic(), 512, ov::Dimension::dynamic()}, + {{1, 512, 191}} + }, + { + {1, 1, ov::Dimension::dynamic()}, + {{1, 1, 1}} + } + }, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_ConvolutionLayerGPUTest_dynamic1D_test_1, ConvolutionLayerGPUTestDynamicEltwiseFusing, + ::testing::Combine( + ::testing::Combine( + ::testing::Values(std::vector{1}), + ::testing::Values(std::vector{1}), + ::testing::Values(std::vector{0}), + ::testing::Values(std::vector{0}), + ::testing::Values(std::vector{1}), + ::testing::Values(512), + ::testing::Values(ov::op::PadType::EXPLICIT)), + ::testing::Values(ov::element::f32), + 
::testing::ValuesIn(dynInputShapes1D_test1), + ::testing::Values(ov::test::utils::DEVICE_GPU), + ::testing::Values(false)), + ConvolutionLayerGPUTestDynamicEltwiseFusing::getTestCaseName); + +const std::vector> dynInputShapes1D_test2 = { + { + { + {1, 2048, ov::Dimension::dynamic()}, + {{1, 2048, 191}} + }, + { + {1, 2048, ov::Dimension::dynamic()}, + {{1, 2048, 1}} + } + }, + { + { + {1, 2048, ov::Dimension::dynamic()}, + {{1, 2048, 191}} + }, + { + {ov::Dimension::dynamic(), 1, ov::Dimension::dynamic()}, + {{1, 1, 191}} + } + }, + { + { + {1, 2048, ov::Dimension::dynamic()}, + {{1, 2048, 191}} + }, + { + {ov::Dimension::dynamic(), 1, ov::Dimension::dynamic()}, + {{1, 1, 1}} + } + }, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_ConvolutionLayerGPUTest_dynamic1D_test_2, ConvolutionLayerGPUTestDynamicEltwiseFusing, + ::testing::Combine( + ::testing::Combine( + ::testing::Values(std::vector{1}), + ::testing::Values(std::vector{1}), + ::testing::Values(std::vector{0}), + ::testing::Values(std::vector{0}), + ::testing::Values(std::vector{1}), + ::testing::Values(2048), + ::testing::Values(ov::op::PadType::EXPLICIT)), + ::testing::Values(ov::element::f32), + ::testing::ValuesIn(dynInputShapes1D_test2), + ::testing::Values(ov::test::utils::DEVICE_GPU), + ::testing::Values(false)), + ConvolutionLayerGPUTestDynamicEltwiseFusing::getTestCaseName); } // namespace diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/convolution_backprop_data.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/convolution_backprop_data.cpp index 6b255c9981c08a..98176acfc9bdc7 100644 --- a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/convolution_backprop_data.cpp +++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/convolution_backprop_data.cpp @@ -102,8 +102,8 @@ class DeconvolutionLayerGPUTest : public testing::WithParamInterface> emptyOutputPadding1d = { {0} }; + +/* ============= Deconvolution params ============= */ +const std::vector numOutChannels1d = { 256 }; + +/* ============= Deconvolution params (1D) ============= */ +const std::vector> kernels1d = { {16} }; +const std::vector> strides1d = { {8} }; +const std::vector> padBegins1d = { {4} }; +const std::vector> padEnds1d = { {4} }; +const std::vector> dilations1d = { {1} }; + +/* ============= Deconvolution (1D) ============= */ +const auto convParams_ExplicitPadding_1D = ::testing::Combine( + ::testing::ValuesIn(kernels1d), + ::testing::ValuesIn(strides1d), + ::testing::ValuesIn(padBegins1d), + ::testing::ValuesIn(padEnds1d), + ::testing::ValuesIn(dilations1d), + ::testing::ValuesIn(numOutChannels1d), + ::testing::Values(ov::op::PadType::EXPLICIT), + ::testing::ValuesIn(emptyOutputPadding1d) +); + +const std::vector dyn_1D_inputs_smoke = { + DeconvInputData{ + InputShape{{1, 512, -1}, {{1, 512, 577}}}, + ov::test::utils::InputLayerType::CONSTANT, + {} + }, +}; + +const std::vector netPrecisions1D = { + ov::element::f32 +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Deconv_1D_Dynamic_FP32, DeconvolutionLayerGPUTest, + ::testing::Combine( + convParams_ExplicitPadding_1D, + ::testing::ValuesIn(dyn_1D_inputs_smoke), + ::testing::ValuesIn(netPrecisions1D), + ::testing::Values(ov::test::utils::DEVICE_GPU), + ::testing::Values(emptyAdditionalConfig)), + DeconvolutionLayerGPUTest::getTestCaseName); } // namespace From d0056bd5154d556b748d1e4d2d6bebc62c25444a Mon Sep 17 00:00:00 2001 From: Aleksandr Voron Date: Mon, 21 Oct 2024 07:21:06 +0200 Subject: [PATCH 31/32] [CPU][ARM] Enable fast math in 
ACL deconvolution executor (#26615) ### Details: - ACL deconvolution `fast_math` option is enabled on `PERFORMANCE` mode. - This option enables fast math computation in ACL. In case this flag were set, ACL could dispatch the fastest implementation available which may introduce a drop of accuracy as well. - Accuracy testing on dataset subset highlights some deviations from reference values. Results are attached to the ticket. ### Tickets: - CVS-152534 --- src/plugins/intel_cpu/src/config.cpp | 6 ++++++ src/plugins/intel_cpu/src/config.h | 3 +++ src/plugins/intel_cpu/src/nodes/deconv.cpp | 3 +++ .../intel_cpu/src/nodes/executors/acl/acl_deconv.cpp | 5 +++-- src/plugins/intel_cpu/src/nodes/executors/deconv.hpp | 3 +++ 5 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index 2b9cdcc4ac1203..421dca07747932 100644 --- a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -400,6 +400,12 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { inferencePrecision = ov::element::undefined; } } + // enable ACL fast math in PERFORMANCE mode +#if defined(OV_CPU_WITH_ACL) + if (executionMode == ov::hint::ExecutionMode::PERFORMANCE) { + aclFastMath = true; + } +#endif // disable dynamic quantization and kv quantization for best accuracy if (executionMode == ov::hint::ExecutionMode::ACCURACY) { if (!fcDynamicQuantizationGroupSizeSetExplicitly) { diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index eeb8e78f5fa91a..79cdf3a5e827ec 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -53,6 +53,9 @@ struct Config { uint64_t fcDynamicQuantizationGroupSize = 32; ov::element::Type kvCachePrecision = ov::element::f16; bool fcDynamicQuantizationGroupSizeSetExplicitly = false; +#if defined(OV_CPU_WITH_ACL) + bool aclFastMath = false; +#endif #if defined(OPENVINO_ARCH_X86_64) size_t rtCacheCapacity = 5000ul; #else diff --git a/src/plugins/intel_cpu/src/nodes/deconv.cpp b/src/plugins/intel_cpu/src/nodes/deconv.cpp index 57046a0a06d55b..8a7f95268b4f3a 100644 --- a/src/plugins/intel_cpu/src/nodes/deconv.cpp +++ b/src/plugins/intel_cpu/src/nodes/deconv.cpp @@ -219,6 +219,9 @@ Deconvolution::Deconvolution(const std::shared_ptr& op, for (size_t i = 0; i < deconvAttrs.dilation.size(); i++) { deconvAttrs.kernel.push_back(weightDims[withGroups + 2 + i]); } +#if defined(OV_CPU_WITH_ACL) + deconvAttrs.aclFastMath = context->getConfig().aclFastMath; +#endif externOutShape = inputShapes.size() == 3; biasPort = externOutShape ? 3 : 2; diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp index 1345451669bdec..7d400bf96d7cb0 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp @@ -99,7 +99,7 @@ bool AclDeconvExecutor::init(const DeconvAttrs& deconvAttrs, deconv = std::make_unique(); configureThreadSafe([&] { - deconv->configure(&srcTensor, &weiTensor, deconvAttrs.withBiasesParam ? &biasTensor : nullptr, &dstTensor, deconv_info); + deconv->configure(&srcTensor, &weiTensor, deconvAttrs.withBiasesParam ? &biasTensor : nullptr, &dstTensor, deconv_info, deconvAttrs.aclFastMath); }); return true; } @@ -271,7 +271,8 @@ bool AclDeconvExecutorBuilder::customIsSupported(const DeconvAttrs &deconvAttrs, &weiTensorInfo, deconvAttrs.withBiasesParam ? 
&biasTensorInfo : nullptr, &dstTensorInfo, - deconv_info); + deconv_info, + deconvAttrs.aclFastMath); if (!status) { DEBUG_LOG("NEDeconvolutionLayer validation failed: ", status.error_description()); return false; diff --git a/src/plugins/intel_cpu/src/nodes/executors/deconv.hpp b/src/plugins/intel_cpu/src/nodes/executors/deconv.hpp index 9528e5a5ef03e0..c632cc0cf99ad1 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/deconv.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/deconv.hpp @@ -22,6 +22,9 @@ struct DeconvAttrs { std::vector paddingR; ov::CoordinateDiff outputPadding; bool withBiasesParam = false; +#if defined(OV_CPU_WITH_ACL) + bool aclFastMath = false; +#endif }; class DeconvExecutor { From 0064022e0662b1fd8d169c2085b84271819939fb Mon Sep 17 00:00:00 2001 From: Xiuchuan Zhai Date: Mon, 21 Oct 2024 14:25:50 +0800 Subject: [PATCH 32/32] fix the coverity (#26963) ### Details: - *item1* - *...* ### Tickets: - *153061* --- .../intel_cpu/src/nodes/fullyconnected.cpp | 34 +++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp index 5d2b6fd9b50212..7f6ed99b1173d7 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp @@ -107,13 +107,14 @@ void FullyConnected::needPrepareParamsForTensorParallel() { if (dim < 0) { dim += dims.size(); } - assert(static_cast(dims[dim]) >= tp_cfg.w_size); + OPENVINO_ASSERT(static_cast(dims[dim]) >= tp_cfg.w_size, + getName() + " dim[" + std::to_string(dim) + "] is " + std::to_string(dims[dim]) + ", which is larger than w_size " + std::to_string(tp_cfg.w_size)); auto splited_dim_vec = split_parts(dims[dim], tp_cfg.w_size); - VectorDims new_dims = dims; + VectorDims new_dims = std::move(dims); new_dims[dim] = splited_dim_vec[tp_cfg.w_rank]; auto memory_desc = dst_desc->cloneWithNewDims(new_dims, true); - tp_cfg.cached_dst->redefineDesc(memory_desc); + tp_cfg.cached_dst->redefineDesc(std::move(memory_desc)); memory[ARG_DST] = tp_cfg.cached_dst; } } @@ -133,6 +134,7 @@ void FullyConnected::prepareParams() { void FullyConnected::initTensorParallelSync() { if (tp_cfg.enable_tensor_parallel) { tp_cfg.id = tp_cfg.sub_memory->get_memory_id(tp_cfg.w_rank); + OPENVINO_ASSERT(tp_cfg.id > 0, "Tensor Parallel Config ID cannot be negative."); tp_cfg.sub_memory->set_memory_used(tp_cfg.id, tp_cfg.w_rank); while (true) { std::lock_guard lock(tp_cfg.sub_memory->_flagMutex); @@ -155,7 +157,7 @@ void FullyConnected::execTensorParallelSync() { auto dst = getDstMemoryAtPort(0); auto dst_ptr = static_cast(dst->getData()); - auto shape = dst->getShape(); + auto& shape = dst->getShape(); auto dims = shape.getDims(); auto prec = dst->getPrecision(); @@ -240,6 +242,10 @@ bool FullyConnected::canFuse(const NodePtr& node) const { #endif if (node->getType() == Type::FakeQuantize) { auto* fq = dynamic_cast(node.get()); + if (!fq) { + DEBUG_LOG("Invalid dynamic_cast FakeQuantize pointer"); + return false; + } if (fq->getBroadcastingPolicy() != FakeQuantize::BroadcastingPolicy::PerTensor) { const auto& dstShape = getOutputShapeAtPort(0); auto dataRanks = dstShape.getRank(); @@ -377,7 +383,7 @@ void FullyConnected::needUpdateDQScaleForTensorParallel(std::vector& dequ auto split_offset = tp_cfg.w_rank * split_lens[0]; std::vector newDQScales(split_lens[tp_cfg.w_rank]); std::copy(DQScales.begin() + split_offset, DQScales.begin() + split_offset + split_lens[tp_cfg.w_rank], 
newDQScales.begin()); - dequantizationScales = newDQScales; + dequantizationScales = std::move(newDQScales); } } @@ -448,21 +454,21 @@ void FullyConnected::needSplitMemoryForTensorParallel() { memory[ARG_SRC] = getSrcMemoryAtPort(DATA_ID); // wgt // split N direction - tp_cfg.cached_splited_weight = attrs.weightsNonTransposed ? split_vertical(context->getEngine(), wgt, 0, tp_cfg.w_rank, tp_cfg.w_size) - : split_horizontal(context->getEngine(), wgt, 0, tp_cfg.w_rank, tp_cfg.w_size); + tp_cfg.cached_splited_weight = attrs.weightsNonTransposed ? split_vertical(context->getEngine(), std::move(wgt), 0, tp_cfg.w_rank, tp_cfg.w_size) + : split_horizontal(context->getEngine(), std::move(wgt), 0, tp_cfg.w_rank, tp_cfg.w_size); memory[ARG_WEI] = tp_cfg.cached_splited_weight; // bias if (attrs.withBias) { auto bias = getSrcMemoryAtPort(BIAS_ID); - auto select_bias = split_horizontal(context->getEngine(), bias, 0, tp_cfg.w_rank, tp_cfg.w_size); - tp_cfg.cached_splited_bias = select_bias; + auto select_bias = split_horizontal(context->getEngine(), std::move(bias), 0, tp_cfg.w_rank, tp_cfg.w_size); + tp_cfg.cached_splited_bias = std::move(select_bias); } else { tp_cfg.cached_splited_bias = MemoryDescUtils::makeEmptyMemory(context); } memory[ARG_BIAS] = tp_cfg.cached_splited_bias; // dst memory[ARG_DST] = getDstMemoryAtPort(0); - tp_cfg.cached_dst = split_horizontal(context->getEngine(), dst, -1, tp_cfg.w_rank, tp_cfg.w_size, false); + tp_cfg.cached_dst = split_horizontal(context->getEngine(), std::move(dst), -1, tp_cfg.w_rank, tp_cfg.w_size, false); } } @@ -471,7 +477,7 @@ void FullyConnected::needUpdateTensorParalelConfig() { // 1. weight shape is dynamic // 2. last dim can be splited. if (tp_cfg.enable_tensor_parallel) { - auto shape = getSrcMemoryAtPort(WEIGHTS_ID)->getShape(); + auto& shape = getSrcMemoryAtPort(WEIGHTS_ID)->getShape(); if (shape.isDynamic()) { tp_cfg.enable_tensor_parallel = false; } else if (shape.getDims()[0] < static_cast(tp_cfg.w_size)) { @@ -520,8 +526,8 @@ void FullyConnected::needUpdateScaleForTensorParallel() { void FullyConnected::needSplitScaleForTensorParallel(const MemoryCPtr& memory) { if (tp_cfg.enable_tensor_parallel && !tp_cfg.cached_scale) { auto scale_mem = std::const_pointer_cast(memory); - tp_cfg.cached_scale = attrs.weightsNonTransposed ? split_vertical(context->getEngine(), scale_mem, 0, tp_cfg.w_rank, tp_cfg.w_size) - : split_horizontal(context->getEngine(), scale_mem, 0, tp_cfg.w_rank, tp_cfg.w_size); + tp_cfg.cached_scale = attrs.weightsNonTransposed ? split_vertical(context->getEngine(), std::move(scale_mem), 0, tp_cfg.w_rank, tp_cfg.w_size) + : split_horizontal(context->getEngine(), std::move(scale_mem), 0, tp_cfg.w_rank, tp_cfg.w_size); } } @@ -536,7 +542,7 @@ void FullyConnected::needSplitZeroPointForTensorParallel(const MemoryCPtr& memor auto zeropoint_mem = std::const_pointer_cast(memory); auto element_num = memory->getSize() / memory->getPrecision().size(); if (element_num == 1) { - tp_cfg.cached_zeropoint = zeropoint_mem; + tp_cfg.cached_zeropoint = std::move(zeropoint_mem); } else { tp_cfg.cached_zeropoint = attrs.weightsNonTransposed ? split_vertical(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size) : split_horizontal(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size);