From b8ced3dfc7f1eb5a399afaecebb1fe6828316653 Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Fri, 7 Jun 2024 12:50:30 +0200 Subject: [PATCH 01/18] Change opt-125m-gptq model (#24899) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- tests/model_hub_tests/pytorch/test_hf_transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/model_hub_tests/pytorch/test_hf_transformers.py b/tests/model_hub_tests/pytorch/test_hf_transformers.py index bd27638159b3da..caa49fb8d032c3 100644 --- a/tests/model_hub_tests/pytorch/test_hf_transformers.py +++ b/tests/model_hub_tests/pytorch/test_hf_transformers.py @@ -450,7 +450,7 @@ def load_model_with_default_class(name, **kwargs): ("google/tapas-large-finetuned-wtq", "tapas"), ("gpt2", "gpt2"), ("openai/clip-vit-large-patch14", "clip"), - ("OpenVINO/opt-125m-gptq", "opt"), + ("katuni4ka/opt-125m-gptq", "opt"), ]) @pytest.mark.precommit def test_convert_model_precommit(self, name, type, ie_device): From 0cc051debbb310f28854aa2cff9d948a17056d33 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Fri, 7 Jun 2024 09:53:13 +0400 Subject: [PATCH 02/18] [GPU] Don't add crop for unused out ports of Split op (#24877) ### Details: - Currently we insert a crop primitive for each output port of (Variadic)Split ops, so if some of them are not used we just waste execution time. This patch modifies the converter for Split to process only ports with real users. --- src/plugins/intel_gpu/src/plugin/ops/split.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/plugins/intel_gpu/src/plugin/ops/split.cpp b/src/plugins/intel_gpu/src/plugin/ops/split.cpp index be6a5f67c2f398..fc323033be94a9 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/split.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/split.cpp @@ -65,6 +65,11 @@ static void CreateCommonSplitOp(ProgramBuilder& p, const std::shared_ptrget_output_size(); i++) { + const auto& users = op->get_output_target_inputs(i); + // don't add crop primitive if port is not used by anyone + if (users.size() == 0) + continue; + auto cropPrim = cldnn::crop(get_layer_name(i), inputs, cldnn::tensor(1), From 416c4ffb7e9a2bbfa7b5c283b546bc776e3eed05 Mon Sep 17 00:00:00 2001 From: barnasm1 Date: Fri, 7 Jun 2024 07:55:52 +0200 Subject: [PATCH 03/18] [CORE] deprecate get/set_concatenation_axis functions (#24737) ### Details: - remove `set_concatenation_axis` function usage - replace `get_concatenation_axis` function usage ### Tickets: - [CVS-94507](https://jira.devtools.intel.com/browse/CVS-94507) ### Related PR: - https://github.com/openvinotoolkit/openvino/pull/24383 --------- Co-authored-by: Michal Lukaszewski --- .../src/move_fake_quantize.cpp | 12 ++++++++++-- .../src/network_helper.cpp | 12 ++++++++++-- .../src/pruning/propagate_masks.cpp | 6 +++++- .../common_optimizations/concat_to_broadcast.cpp | 4 ++-- .../symbolic_transformations/dereshape_matmul.cpp | 6 ++++-- .../symbol_optimization.cpp | 6 +++++- .../transpose_sinking/ts_concat.cpp | 15 +++++++++------ src/core/include/openvino/op/concat.hpp | 4 ++++ .../include/concat_shape_inference.hpp | 2 +- src/core/src/op/concat.cpp | 6 +----- src/core/tests/copy.cpp | 2 +- src/plugins/intel_gpu/src/graph/concatenation.cpp | 2 +- 12 files changed, 53 insertions(+), 24 deletions(-) diff --git a/src/common/low_precision_transformations/src/move_fake_quantize.cpp b/src/common/low_precision_transformations/src/move_fake_quantize.cpp index 1b3fbec4074ade..bce8ca3bb684e3 100644 ---
a/src/common/low_precision_transformations/src/move_fake_quantize.cpp +++ b/src/common/low_precision_transformations/src/move_fake_quantize.cpp @@ -9,6 +9,7 @@ #include #include "openvino/core/node.hpp" +#include "openvino/core/validation_util.hpp" #include "openvino/opsets/opset1.hpp" #include "openvino/pass/pattern/op/or.hpp" @@ -83,10 +84,17 @@ bool MoveFakeQuantize::transform(TransformationContext& context, ov::pass::patte if (concat_node == nullptr) { return false; } - const auto concat_axis = concat_node->get_concatenation_axis(); + + const auto rank = concat_node->get_output_partial_shape(0).rank(); + if (rank.is_dynamic()) { + return false; + } + + const auto concat_axis = ov::util::normalize(concat_node->get_axis(), rank.get_length()); + for (size_t i = 0; i < 4; i++) { curr_constants[i] = as_type_ptr(fq->get_input_node_shared_ptr(i + 1)); - if (!multi_chanels && concat_axis >= 0 && curr_constants[i]->get_shape().size() > static_cast(concat_axis) + if (!multi_chanels && curr_constants[i]->get_shape().size() > static_cast(concat_axis) && curr_constants[i]->get_shape()[concat_axis] != 1) { multi_chanels = true; } diff --git a/src/common/low_precision_transformations/src/network_helper.cpp b/src/common/low_precision_transformations/src/network_helper.cpp index 58561cb1482329..d0a7470a6cf300 100644 --- a/src/common/low_precision_transformations/src/network_helper.cpp +++ b/src/common/low_precision_transformations/src/network_helper.cpp @@ -1561,7 +1561,11 @@ NetworkHelper::InsertDequantizationResult NetworkHelper::moveDequantizationBefor std::vector>> multiplyConstants, subtractConstants; if (is_type(operation)) { const auto concatNode = as_type_ptr(operation); - auto axis = concatNode->get_concatenation_axis(); + int64_t axis = -1; + if (concatNode->get_output_partial_shape(0).rank().is_static()) { + const auto rank = concatNode->get_output_partial_shape(0).rank().get_length(); + axis = ov::util::normalize(concatNode->get_axis(), rank); + } if (dequantization.multiply && dequantization.multiplyConstant->get_shape().size() > 1 && dequantization.multiplyConstant->get_shape()[axis] != 1) { multiplyConstants = NetworkHelper::splitConstantsBeforeConcat(operation, { dequantization.multiplyConstant }); } @@ -1659,7 +1663,11 @@ std::vector>> NetworkHelper::s std::vector>> newConstants(currConstants.size()); auto number_of_concat_inputs = concat->get_input_size(); const auto concatNode = as_type_ptr(concat); - const auto concat_axis = concatNode->get_concatenation_axis(); + int64_t concat_axis = -1; + if (concatNode->get_output_partial_shape(0).rank().is_static()) { + const auto rank = concatNode->get_output_partial_shape(0).rank().get_length(); + concat_axis = ov::util::normalize(concatNode->get_axis(), rank); + } std::vector shape_axis(number_of_concat_inputs); for (size_t i{ 0 }; i < number_of_concat_inputs; ++i) { auto shape = concat->get_input_partial_shape(i); diff --git a/src/common/offline_transformations/src/pruning/propagate_masks.cpp b/src/common/offline_transformations/src/pruning/propagate_masks.cpp index 3d22feafa60b1d..cc245e5ddd7fb7 100644 --- a/src/common/offline_transformations/src/pruning/propagate_masks.cpp +++ b/src/common/offline_transformations/src/pruning/propagate_masks.cpp @@ -765,7 +765,11 @@ class ov::pass::mask_propagation::Concat : public MatcherPass { if (!concat_ptr) { return false; } - auto axis = concat_ptr->get_concatenation_axis(); + int64_t axis = -1; + if (concat_ptr->get_output_partial_shape(0).rank().is_static()) { + const auto rank = 
concat_ptr->get_output_partial_shape(0).rank().get_length(); + axis = ov::util::normalize(concat_ptr->get_axis(), rank); + } auto inputs = concat_ptr->inputs(); std::map input_masks; diff --git a/src/common/transformations/src/transformations/common_optimizations/concat_to_broadcast.cpp b/src/common/transformations/src/transformations/common_optimizations/concat_to_broadcast.cpp index db2daca441a700..ec72c16cca77b1 100644 --- a/src/common/transformations/src/transformations/common_optimizations/concat_to_broadcast.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/concat_to_broadcast.cpp @@ -15,7 +15,7 @@ static bool use_broadcast(const std::shared_ptr& concat) { const auto& output = concat->output(0); const auto& input = concat->input(0); - const auto& input_concat_dim = input.get_partial_shape()[concat->get_concatenation_axis()]; + const auto& input_concat_dim = input.get_partial_shape()[concat->get_axis()]; return input_concat_dim.is_static() && input_concat_dim.get_length() == 1 && output.get_partial_shape().is_static(); } @@ -87,4 +87,4 @@ ov::pass::ConcatToBroadcast::ConcatToBroadcast() { auto m = std::make_shared(concat_label, matcher_name); this->register_matcher(m, callback); -} \ No newline at end of file +} diff --git a/src/common/transformations/src/transformations/symbolic_transformations/dereshape_matmul.cpp b/src/common/transformations/src/transformations/symbolic_transformations/dereshape_matmul.cpp index f5ee0222c5fcdc..d2585b08f853b4 100644 --- a/src/common/transformations/src/transformations/symbolic_transformations/dereshape_matmul.cpp +++ b/src/common/transformations/src/transformations/symbolic_transformations/dereshape_matmul.cpp @@ -29,7 +29,9 @@ bool concat_predicate(ov::Output output) { const auto& concat = ov::as_type_ptr(output.get_node_shared_ptr()); if (!concat) return false; - return concat->get_concatenation_axis() >= output_pshape.rank().get_length() - 2; + + const auto norm_axis = ov::util::normalize(concat->get_axis(), output_pshape.rank().get_length()); + return norm_axis >= output_pshape.rank().get_length() - 2; } bool last_two_dims_are_equal(const ov::PartialShape& lhs, const ov::PartialShape& rhs) { @@ -136,7 +138,7 @@ void pull_reshape_through_optional_concat_and_bea(const ov::pass::pattern::Patte OPENVINO_ASSERT(concat_node != nullptr, "DeReshapeMatMul transformation matched operation which should be Concat -- but it is not"); auto rank = concat_node->get_output_partial_shape(0).rank().get_length(); - auto axis = (concat_node->get_concatenation_axis() == (rank - 1)) ? -1 : -2; + auto axis = (ov::util::normalize(concat_node->get_axis(), rank) == (rank - 1)) ? -1 : -2; auto idx_of_reshape_input = reshape_output == concat_node->input_value(0) ? 
0 : 1; auto idx_of_non_reshape_input = static_cast(!idx_of_reshape_input); diff --git a/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp b/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp index f779c73d86e487..1a4507c08dc9f0 100644 --- a/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp +++ b/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp @@ -7,6 +7,7 @@ #include "itt.hpp" #include "openvino/core/bound_evaluation_util.hpp" #include "openvino/core/rt_info.hpp" +#include "openvino/core/validation_util.hpp" #include "openvino/op/add.hpp" #include "openvino/op/concat.hpp" #include "openvino/op/convert.hpp" @@ -163,7 +164,10 @@ ov::Output alternative_source_from_concat_input_sources(const STS_map& if (!concat || concat->get_input_size() != 2) return alternative_source; int64_t idx = get_idx_of_symbol_in_source(source, symbol); - if (idx == -1 || idx != concat->get_concatenation_axis()) + if (idx == -1) + return alternative_source; + const auto rank = source.get_partial_shape().rank().get_length(); + if (idx != ov::util::normalize(concat->get_axis(), rank)) return alternative_source; // optimize using the knowledge of the Concat SI and what happens on the axis const auto& lhs_pshape = concat->get_input_partial_shape(0); diff --git a/src/common/transformations/src/transformations/transpose_sinking/ts_concat.cpp b/src/common/transformations/src/transformations/transpose_sinking/ts_concat.cpp index 0f7627b7c1c910..8dbcf7ba285f5b 100644 --- a/src/common/transformations/src/transformations/transpose_sinking/ts_concat.cpp +++ b/src/common/transformations/src/transformations/transpose_sinking/ts_concat.cpp @@ -5,6 +5,7 @@ #include "transformations/transpose_sinking/ts_concat.hpp" #include "itt.hpp" +#include "openvino/core/validation_util.hpp" #include "openvino/op/concat.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/transpose.hpp" @@ -35,8 +36,7 @@ TSConcatForward::TSConcatForward() { return false; } - auto concat_axis = concat_node->get_concatenation_axis(); - if (concat_axis < 0) { + if (concat_node->get_output_partial_shape(0).is_dynamic()) { return false; } // todo: support dyn rank case @@ -45,10 +45,12 @@ TSConcatForward::TSConcatForward() { return false; } + const auto rank = concat_node->get_output_partial_shape(0).rank().get_length(); + const auto concat_axis = ov::util::normalize(concat_node->get_axis(), rank); + const auto transpose_axis_order = transpose_info.transpose_const->get_axis_vector_val(); const int64_t transposed_concat_axis = transpose_axis_order[concat_axis]; concat_node->set_axis(transposed_concat_axis); - concat_node->set_concatenation_axis(-1); default_outputs_update(main_node, transpose_info); return true; @@ -81,11 +83,13 @@ TSConcatBackward::TSConcatBackward() { } auto concat_node = as_type_ptr(main_node); - auto concat_axis = concat_node->get_concatenation_axis(); - if (concat_axis < 0) { + if (concat_node->get_output_partial_shape(0).is_dynamic()) { return false; } + const auto rank = concat_node->get_output_partial_shape(0).rank().get_length(); + auto concat_axis = ov::util::normalize(concat_node->get_axis(), rank); + const auto transpose_axis_order = transpose_const->get_axis_vector_val(); const auto reversed_transpose_axis_order = ReverseTransposeOrder(transpose_axis_order); if (static_cast(reversed_transpose_axis_order.size()) <= concat_axis) { @@ -94,7 +98,6 @@ 
TSConcatBackward::TSConcatBackward() { const auto transposed_concat_axis = reversed_transpose_axis_order[concat_axis]; concat_node->set_axis(static_cast(transposed_concat_axis)); - concat_node->set_concatenation_axis(-1); for (auto& new_node : sink_backward::InsertTransposeBeforeNode(main_node, transpose_const)) { register_new_node(new_node); diff --git a/src/core/include/openvino/op/concat.hpp b/src/core/include/openvino/op/concat.hpp index 5fa76967326a53..0e8fa67c54dfae 100644 --- a/src/core/include/openvino/op/concat.hpp +++ b/src/core/include/openvino/op/concat.hpp @@ -37,9 +37,13 @@ class OPENVINO_API Concat : public Op { std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; /// \return The concatenation axis. + OPENVINO_DEPRECATED("The function get_concatenation_axis() is deprecated. Will be removed in 2025.0 release. Use " + "get_axis() instead.") int64_t get_concatenation_axis() const { return m_concat_axis; } + OPENVINO_DEPRECATED("The function set_concatenation_axis() is deprecated. Will be removed in 2025.0 release. Use " + "set_axis() instead.") void set_concatenation_axis(int64_t concatenation_axis) { m_concat_axis = concatenation_axis; } diff --git a/src/core/shape_inference/include/concat_shape_inference.hpp b/src/core/shape_inference/include/concat_shape_inference.hpp index 1f5d04b52fd7c3..9978a79871111b 100644 --- a/src/core/shape_inference/include/concat_shape_inference.hpp +++ b/src/core/shape_inference/include/concat_shape_inference.hpp @@ -17,7 +17,7 @@ std::vector shape_infer(const Concat* op, const std::vector& input_s NODE_VALIDATION_CHECK(op, !input_shapes.empty()); using DimType = typename T::value_type; - auto concat_axis = op->get_concatenation_axis() < 0 ? op->get_axis() : op->get_concatenation_axis(); + auto concat_axis = op->get_axis(); const auto empty_dim = DimType{}; auto concat_dim = DimType{0}; diff --git a/src/core/src/op/concat.cpp b/src/core/src/op/concat.cpp index 61c49d258b28a3..de24245fa28458 100644 --- a/src/core/src/op/concat.cpp +++ b/src/core/src/op/concat.cpp @@ -39,11 +39,7 @@ void Concat::validate_and_infer_types() { input_shapes.push_back(get_input_partial_shape(i)); } - const auto output_shapes = shape_infer(this, input_shapes); - const auto& output_shape = output_shapes[0]; - if (output_shape.rank().is_static() && (get_concatenation_axis() < 0)) { - set_concatenation_axis(ov::util::normalize(get_axis(), output_shape.size())); - } + const auto output_shape = shape_infer(this, input_shapes).front(); set_output_type(0, inputs_et, output_shape); } diff --git a/src/core/tests/copy.cpp b/src/core/tests/copy.cpp index 5456374ca7647d..106cb20fdfbfb0 100644 --- a/src/core/tests/copy.cpp +++ b/src/core/tests/copy.cpp @@ -142,7 +142,7 @@ TEST(copy, concat) { ASSERT_TRUE(nullptr != new_node); ASSERT_TRUE(new_args == new_node->input_values()); - ASSERT_TRUE(node_cast->get_concatenation_axis() == axis); + ASSERT_TRUE(node_cast->get_axis() == axis); } TEST(copy, constant) { diff --git a/src/plugins/intel_gpu/src/graph/concatenation.cpp b/src/plugins/intel_gpu/src/graph/concatenation.cpp index ae4a8828d21b8f..b493bb217b1c32 100644 --- a/src/plugins/intel_gpu/src/graph/concatenation.cpp +++ b/src/plugins/intel_gpu/src/graph/concatenation.cpp @@ -69,7 +69,7 @@ std::vector concatenation_inst::calc_output_layouts(const concatenation_ } ov::op::v0::Concat op; op.set_friendly_name(desc->id); - op.set_concatenation_axis(axis_index); + op.set_axis(axis_index); std::vector output_shapes = ov::op::v0::shape_infer(&op, input_shapes); 
return { layout {output_shapes[0], output_dt, output_format} }; } From 00510e69fdf452e36ef3c50a976fe501064ad49c Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Fri, 7 Jun 2024 09:29:01 +0300 Subject: [PATCH 04/18] [Snippets] Fixed Invalid Read in FuseLoops pass (#24880) ### Details: - *Fixed an invalid read in the FuseLoops pass. Previously, loop ports were saved by reference from `current_loop_info`. However, `current_loop_info` might be replaced with another shared pointer, and the previously saved references become invalid once the original `current_loop_info` expires.* - *With this fix, `Valgrind` no longer reports the issue.* ### Tickets: - *N/A* --- src/common/snippets/src/lowered/pass/fuse_loops.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/common/snippets/src/lowered/pass/fuse_loops.cpp b/src/common/snippets/src/lowered/pass/fuse_loops.cpp index ec0743bf4df7d0..baf6e05c45c3ef 100644 --- a/src/common/snippets/src/lowered/pass/fuse_loops.cpp +++ b/src/common/snippets/src/lowered/pass/fuse_loops.cpp @@ -220,10 +220,11 @@ bool FuseLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, l // Loop_0 (Upper) | // | => | // Loop_1 (Current) Loop_0 + Loop_1 => new `Loop_1` - const auto& input_ports = current_loop_info->get_input_ports(); + // Make a copy of `input_ports` (not ref), since current_loop_info might be changed and ref will be invalid + const auto input_ports = current_loop_info->get_input_ports(); bool was_fusion_up = false; - for (size_t in_port = 0; in_port < input_ports.size() && !was_fusion_up; ++in_port) { - const auto input_port = input_ports[in_port]; + for (size_t in_port = 0; !was_fusion_up && in_port < input_ports.size(); ++in_port) { + const auto& input_port = input_ports[in_port]; const auto parent_expr_output = *input_port.expr_port->get_connected_ports().begin(); const auto& parent_expr = parent_expr_output.get_expr(); const auto parent = parent_expr->get_node(); @@ -265,10 +266,10 @@ bool FuseLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, l // Loop_0 (Current) Loop_0 + Loop_1 => new `Loop_0` // | => | // Loop_1 (Lower) | - const auto& output_ports = current_loop_info->get_output_ports(); bool was_fusion_down = false; - for (size_t out_port = 0; out_port < output_ports.size() && !was_fusion_down; ++out_port) { - const auto output_port = output_ports[out_port]; + const auto& output_ports = current_loop_info->get_output_ports(); + for (size_t out_port = 0; !was_fusion_down && out_port < output_ports.size(); ++out_port) { + const auto& output_port = output_ports[out_port]; const auto consumer_exprs_inputs = output_port.expr_port->get_connected_ports(); for (const auto& consumer_expr_input : consumer_exprs_inputs) { const auto& consumer_expr = consumer_expr_input.get_expr(); From faa19e6121286f4cff56849bb65d8b3b5976d617 Mon Sep 17 00:00:00 2001 From: hyunback kim Date: Fri, 7 Jun 2024 15:32:38 +0900 Subject: [PATCH 05/18] [GPU] Fix wrong condition for dynamic onednn convolution impl-type. (#24497) Remove the limitation that oneDNN convolution doesn't support dynamic input and output. With this limitation, clDNN convolution would run forever on DPAS platforms. Because oneDNN optimized convolutions require blocked formats and the current shape-agnostic convolution doesn't support blocked formats, the convolution does nothing in the compile_graph stage and the actual kernel compilation happens at the first inference. 
### Tickets: - *141040* Signed-off-by: hyunback --- .../intel_gpu/src/graph/layout_optimizer.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index 460ed9b0301e0c..1f5d02ed13ccb2 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -894,11 +894,6 @@ static bool is_node_for_onednn(convolution_node const& node) { if (!layout_optimizer::are_data_types_suitable_for_onednn((program_node&)node)) return false; - auto input_layout = node.get_input_layout(0); - auto output_layout = node.get_output_layout(0); - if (input_layout.is_dynamic() || output_layout.is_dynamic()) - return false; - return true; } @@ -907,9 +902,6 @@ static bool is_node_for_onednn(deconvolution_node const& node) { auto input_layout = node.get_input_layout(0); auto output_layout = node.get_output_layout(0); - if (input_layout.is_dynamic() || output_layout.is_dynamic()) - return false; - bool onednn_valid_dt = layout_optimizer::are_data_types_suitable_for_onednn((program_node&)node); bool onednn_valid_params = onednn_valid_dt && @@ -1694,6 +1686,11 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format impl_candidate = impl_types::ocl; } + if (node.is_type()) { + if (!is_node_for_onednn(node.as())) + impl_candidate = impl_types::ocl; + } + if (node.is_type()) { if (!is_node_for_onednn(node.as())) impl_candidate = impl_types::ocl; From d286268a856208ca44ddae7ed0b9bded565d5804 Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Fri, 7 Jun 2024 15:38:58 +0900 Subject: [PATCH 06/18] [GPU][DOC] Link to OCL article (#24813) --- src/plugins/intel_gpu/docs/gpu_plugin_driver_troubleshooting.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/docs/gpu_plugin_driver_troubleshooting.md b/src/plugins/intel_gpu/docs/gpu_plugin_driver_troubleshooting.md index e3f7c07752dd03..ee97b2060ecae3 100644 --- a/src/plugins/intel_gpu/docs/gpu_plugin_driver_troubleshooting.md +++ b/src/plugins/intel_gpu/docs/gpu_plugin_driver_troubleshooting.md @@ -75,7 +75,7 @@ install them from [OpenCL Git](https://github.com/KhronosGroup/OpenCL-Headers) ## See also - + * [Overview for OpenCL on Linux and troubleshoot](https://bashbaug.github.io/opencl/2019/07/06/OpenCL-On-Linux.html) * [OpenVINO™ README](../../../../README.md) * [OpenVINO Core Components](../../../README.md) * [OpenVINO Plugins](../../README.md) From 236e1062b290e2d2345f1d1c319e78f15e0a311d Mon Sep 17 00:00:00 2001 From: zhaohongbo Date: Fri, 7 Jun 2024 15:06:10 +0800 Subject: [PATCH 07/18] [CPU] Optimize the unique operator (#24850) ### Details: - *Optimize unique using HashMap when sorted=false* - *...* ### Tickets: - *ticket-id* --- src/plugins/intel_cpu/src/nodes/unique.cpp | 42 +++++++++------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/unique.cpp b/src/plugins/intel_cpu/src/nodes/unique.cpp index ad322756ab28e3..130213dfcb8703 100644 --- a/src/plugins/intel_cpu/src/nodes/unique.cpp +++ b/src/plugins/intel_cpu/src/nodes/unique.cpp @@ -225,41 +225,31 @@ void Unique::flattenTensorExec() { } } } else { - uniDataTmpPtr[0] = srcDataPtr[0]; - if (definedOutputs[FIRST_UNIQUE_IDX]) { - firstTmpPtr[0] = 0; - } - if (definedOutputs[INPUT_TO_UNIQ_IDX]) { - inToOutTmpPtr[0] = 0; - } + std::unordered_map uniq; + uniq.reserve(inputLen); + if (definedOutputs[OCCURRENCES_NUM]) { 
std::fill(occurTmpPtr, occurTmpPtr + inputLen, 1); } - uniqueLen = 1; - - for (size_t i = 1; i < inputLen; i++) { - bool found = false; - size_t j = 0; - for (; j < uniqueLen; j++) { - if (uniDataTmpPtr[j] == srcDataPtr[i]) { - found = true; - break; - } - } - if (!found) { - uniDataTmpPtr[uniqueLen] = srcDataPtr[i]; + + for (size_t i = 0, j = 0; i < inputLen; ++i) { + auto it = uniq.emplace(srcDataPtr[i], j); + inToOutTmpPtr[i] = it.first->second; + if (it.second) { if (definedOutputs[FIRST_UNIQUE_IDX]) { - firstTmpPtr[uniqueLen] = i; + firstTmpPtr[j] = i; } - uniqueLen++; + ++j; } else { if (definedOutputs[OCCURRENCES_NUM]) { - occurTmpPtr[j]++; + occurTmpPtr[inToOutTmpPtr[i]]++; } } - if (definedOutputs[INPUT_TO_UNIQ_IDX]) { - inToOutTmpPtr[i] = j; - } + } + + uniqueLen = static_cast(uniq.size()); + for (const auto& it : uniq) { + uniDataTmpPtr[it.second] = it.first; } } From 7f1ddd55acfbc1d2a0601f35e4f0abb9279057aa Mon Sep 17 00:00:00 2001 From: Tomasz Jankowski Date: Fri, 7 Jun 2024 10:14:32 +0200 Subject: [PATCH 08/18] [Core/Ref] Resolve coverity issues (#24874) ### Details: - Fixed coverity issues in src/core/reference ### Tickets: - CVS-143152 --- .../include/openvino/reference/atanh.hpp | 9 ++++++-- .../include/openvino/reference/matmul.hpp | 4 ++-- src/core/reference/src/op/einsum.cpp | 20 ++++++++-------- src/core/reference/src/op/fft.cpp | 23 +++++++------------ src/core/reference/src/op/interpolate.cpp | 20 ++++++++-------- src/core/reference/src/op/loop.cpp | 2 +- 6 files changed, 38 insertions(+), 40 deletions(-) diff --git a/src/core/reference/include/openvino/reference/atanh.hpp b/src/core/reference/include/openvino/reference/atanh.hpp index 5ba554d55179e3..56be82694d55e4 100644 --- a/src/core/reference/include/openvino/reference/atanh.hpp +++ b/src/core/reference/include/openvino/reference/atanh.hpp @@ -18,9 +18,9 @@ T atanh(const T in) { return std::atanh(in); } -template ::value>::type* = nullptr> +// Integral types don't support NAN and INFINITY, use integral limits instead for special values. +template ::value && std::is_signed::value>::type* = nullptr> T atanh(const T in) { - // Integral type not support NAN and INFINITY, use integral limits instead for special values. if (in > 0) { return std::numeric_limits::max(); } else if (in < 0) { @@ -29,6 +29,11 @@ T atanh(const T in) { return 0; } } + +template ::value>::type* = nullptr> +T atanh(const T in) { + return in > 0 ? 
std::numeric_limits::max() : 0; +} } // namespace func /** diff --git a/src/core/reference/include/openvino/reference/matmul.hpp b/src/core/reference/include/openvino/reference/matmul.hpp index 964bbc5c4a264d..92d6fa3cefb6b6 100644 --- a/src/core/reference/include/openvino/reference/matmul.hpp +++ b/src/core/reference/include/openvino/reference/matmul.hpp @@ -161,7 +161,7 @@ void matmul(const T* arg0, broadcast_axes, sizeof(T)); - arg0_shape_tmp = arg0_br_target_shape; + arg0_shape_tmp = std::move(arg0_br_target_shape); arg0_rank = arg0_shape_tmp.size(); arg0_new_data.swap(tmp); arg0_data = arg0_new_data.data(); @@ -175,7 +175,7 @@ void matmul(const T* arg0, arg1_br_target_shape, broadcast_axes, sizeof(T)); - arg1_shape_tmp = arg1_br_target_shape; + arg1_shape_tmp = std::move(arg1_br_target_shape); arg1_rank = arg1_shape_tmp.size(); arg1_new_data.swap(tmp); arg1_data = arg1_new_data.data(); diff --git a/src/core/reference/src/op/einsum.cpp b/src/core/reference/src/op/einsum.cpp index 74027f424ecb7a..b8b23964346225 100644 --- a/src/core/reference/src/op/einsum.cpp +++ b/src/core/reference/src/op/einsum.cpp @@ -124,7 +124,7 @@ std::unordered_map> compute_label_dim_map(const for (size_t ind = 0; ind < num_broadcasted_dims; ++ind) { label_dims.push_back(static_cast(current_dim + ind)); } - resulted_map[label] = label_dims; + resulted_map[label] = std::move(label_dims); current_dim += num_broadcasted_dims; } else if (resulted_map.find(label) != resulted_map.end()) { resulted_map[label].push_back(static_cast(current_dim)); @@ -132,7 +132,7 @@ std::unordered_map> compute_label_dim_map(const } else { std::vector label_dims; label_dims.push_back(static_cast(current_dim)); - resulted_map[label] = label_dims; + resulted_map[label] = std::move(label_dims); ++current_dim; } } @@ -350,8 +350,8 @@ void reduce_input(ov::TensorVector& inputs, reference::reduce_sum(input_ptr.data(), output_ptr.data(), input_shape, reduced_axes); // update a vector of inputs and input subscripts - inputs[input_ind] = output_ptr; - input_subscripts[input_ind] = new_input_subscript; + inputs[input_ind] = std::move(output_ptr); + input_subscripts[input_ind] = std::move(new_input_subscript); } /// \brief Transpose input to layout specified through the required subscript @@ -408,7 +408,7 @@ void transpose_input(ov::TensorVector& inputs, output_shape); // update a vector of inputs and input subscripts - inputs[input_ind] = output_ptr; + inputs[input_ind] = std::move(output_ptr); input_subscripts[input_ind] = required_subscript; } @@ -452,7 +452,7 @@ void broadcast_input(ov::TensorVector& inputs, broadcast_axes, input.get_element_type().size()); - input = output; + input = std::move(output); } /// \brief Build identity tensor that will be used to zero non-diagonal tensor @@ -528,7 +528,7 @@ ov::Tensor build_multi_identity(const ov::Tensor& input, multi_identity.get_shape(), identity.get_shape(), ov::op::AutoBroadcastType::NUMPY); - multi_identity = mul_output; + multi_identity = std::move(mul_output); } return multi_identity; } @@ -545,7 +545,7 @@ void extract_diagonal(ov::TensorVector& inputs, std::vector& input_ const auto& input_ptr = inputs[input_ind]; const auto& input_subscript = input_subscripts[input_ind]; - const auto input_shape = input_ptr.get_shape(); + const auto& input_shape = input_ptr.get_shape(); std::string resultant_subscript = ""; constexpr char ellipsis[] = "..."; @@ -591,8 +591,8 @@ void extract_diagonal(ov::TensorVector& inputs, std::vector& input_ auto result = ov::Tensor(input_ptr.get_element_type(), 
result_shape); reference::reduce_sum(mul_output.data(), result.data(), mul_output.get_shape(), reduced_axes); - inputs[input_ind] = result; - input_subscripts[input_ind] = resultant_subscript; + inputs[input_ind] = std::move(result); + input_subscripts[input_ind] = std::move(resultant_subscript); } /// \brief Reshape input to the new shape specified by sub-shapes of the diff --git a/src/core/reference/src/op/fft.cpp b/src/core/reference/src/op/fft.cpp index 9c88b21fd8d1b8..1e0c04eb4c4e35 100644 --- a/src/core/reference/src/op/fft.cpp +++ b/src/core/reference/src/op/fft.cpp @@ -306,7 +306,8 @@ InfoForFFTCalculation get_info_for_calculation(const Shape& input_data_shape, const int64_t complex_data_rank = static_cast(input_data_shape.size() - 1); const auto reversed_output_shape = fft_common::reverse_shape_of_emulated_complex_tensor(output_shape); - auto fft_axes = get_axes(axes_data, axes_data_shape, complex_data_rank); + auto& fft_axes = result.fft_axes; + fft_axes = get_axes(axes_data, axes_data_shape, complex_data_rank); fft_axes = fft_common::reverse_fft_axes(fft_axes, complex_data_rank); const int64_t fft_rank = fft_axes.size(); @@ -320,30 +321,22 @@ InfoForFFTCalculation get_info_for_calculation(const Shape& input_data_shape, const auto outer_strides = fft_common::compute_strides(outer_lengths); const int64_t outer_size = outer_strides[outer_rank]; - const int64_t buffer_size = compute_buffer_size(fft_lengths); - const auto output_strides = fft_common::compute_strides(reversed_output_shape); - const auto output_fft_strides = get_lengths(output_strides, fft_axes); - const auto output_outer_strides = get_lengths(output_strides, outer_axes); const auto reversed_input_shape = fft_common::reverse_shape_of_emulated_complex_tensor(input_data_shape); - const auto input_fft_lengths = get_lengths(reversed_input_shape, fft_axes); const auto input_strides = fft_common::compute_strides(reversed_input_shape); - const auto input_fft_strides = get_lengths(input_strides, fft_axes); - const auto input_outer_strides = get_lengths(input_strides, outer_axes); - result.fft_axes = fft_axes; result.fft_lengths = fft_lengths; result.fft_strides = fft_strides; result.outer_strides = outer_strides; - result.output_fft_strides = output_fft_strides; - result.output_outer_strides = output_outer_strides; - result.input_fft_lengths = input_fft_lengths; - result.input_fft_strides = input_fft_strides; - result.input_outer_strides = input_outer_strides; + result.output_fft_strides = get_lengths(output_strides, fft_axes); + result.output_outer_strides = get_lengths(output_strides, outer_axes); + result.input_fft_lengths = get_lengths(reversed_input_shape, fft_axes); + result.input_fft_strides = get_lengths(input_strides, fft_axes); + result.input_outer_strides = get_lengths(input_strides, outer_axes); result.fft_rank = fft_rank; result.fft_size = fft_size; result.outer_size = outer_size; - result.buffer_size = buffer_size; + result.buffer_size = compute_buffer_size(fft_lengths); return result; } diff --git a/src/core/reference/src/op/interpolate.cpp b/src/core/reference/src/op/interpolate.cpp index 3b4adc340507cf..ff9bf20eb1a293 100644 --- a/src/core/reference/src/op/interpolate.cpp +++ b/src/core/reference/src/op/interpolate.cpp @@ -93,10 +93,10 @@ InterpolateEvalHelper::InfoForGenericLinearONNXMode InterpolateEvalHelper::get_i result.batch_size = batch_size; result.num_channels = num_channels; result.spatial_rank = static_cast(spatial_rank); - result.input_index_multipliers = input_index_multipliers; - 
result.output_index_multipliers = output_index_multipliers; - result.input_spatial_shape = input_spatial_shape; - result.output_spatial_shape = output_spatial_shape; + result.input_index_multipliers = std::move(input_index_multipliers); + result.output_index_multipliers = std::move(output_index_multipliers); + result.input_spatial_shape = std::move(input_spatial_shape); + result.output_spatial_shape = std::move(output_spatial_shape); return result; } @@ -134,10 +134,10 @@ InterpolateEvalHelper::InfoForLinearMode InterpolateEvalHelper::get_info_for_lin InfoForLinearMode result; result.antialias = antialias; - result.a = a; - result.r = r; + result.a = std::move(a); + result.r = std::move(r); result.prod_a = prod_a; - result.shape_for_indices = shape_for_indices; + result.shape_for_indices = std::move(shape_for_indices); return result; } @@ -163,8 +163,8 @@ InterpolateEvalHelper::ICoords InterpolateEvalHelper::get_icoords(const Coordina icoords_r[axis] = static_cast(std::round(in_coord)); } - result.icoords = icoords; - result.icoords_r = icoords_r; + result.icoords = std::move(icoords); + result.icoords_r = std::move(icoords_r); return result; } @@ -218,7 +218,7 @@ InterpolateEvalHelper::LinearModeInnerIterationResult InterpolateEvalHelper::inn Coordinate inner_coord{unsigned_inner_coords_vector}; result.w = w; - result.inner_coord = inner_coord; + result.inner_coord = std::move(inner_coord); return result; } diff --git a/src/core/reference/src/op/loop.cpp b/src/core/reference/src/op/loop.cpp index f6cbae6ffaec46..17d9a57e538b93 100644 --- a/src/core/reference/src/op/loop.cpp +++ b/src/core/reference/src/op/loop.cpp @@ -51,7 +51,7 @@ void loop(const std::shared_ptr& func, ov::Tensor in_tensor(func->get_parameters().at(cur_iter_idx)->get_element_type(), func->get_parameters().at(cur_iter_idx)->get_shape()); std::memset(in_tensor.data(), 0, in_tensor.get_byte_size()); - inputs_to_body.at(cur_iter_idx) = in_tensor; + inputs_to_body.at(cur_iter_idx) = std::move(in_tensor); } // Port map processing: inputs and back edges From 852878162dda8c93db51ce4b1ad2069247e46bd2 Mon Sep 17 00:00:00 2001 From: Andrii Staikov Date: Fri, 7 Jun 2024 10:23:27 +0200 Subject: [PATCH 09/18] [TRANSFORMATIONS] Add a check to SDPAToPagedAttention to fail fast (#24841) [TRANSFORMATIONS] Add a check to SDPAToPagedAttention to fail fast Add a check to SDPAToPagedAttention to fail fast if no ScaledDotProductAttention operation is present in the graph, since the transformation cannot be applied in that case. 
### Tickets: - CVS-143067 Signed-off-by: Andrii Staikov --------- Signed-off-by: Andrii Staikov Co-authored-by: Ivan Tikhonov --- .../tests/sdpa_to_paged_attention_test.cpp | 27 +++++++++++++++++++ src/core/src/pass/sdpa_to_paged_attention.cpp | 6 +++++ 2 files changed, 33 insertions(+) create mode 100644 src/common/transformations/tests/sdpa_to_paged_attention_test.cpp diff --git a/src/common/transformations/tests/sdpa_to_paged_attention_test.cpp b/src/common/transformations/tests/sdpa_to_paged_attention_test.cpp new file mode 100644 index 00000000000000..0443e7b82de5cc --- /dev/null +++ b/src/common/transformations/tests/sdpa_to_paged_attention_test.cpp @@ -0,0 +1,27 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/pass/sdpa_to_paged_attention.hpp" + +#include + +#include "common_test_utils/test_common.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/pass/manager.hpp" + +using namespace ov; + +TEST(SDPATOPATest, SDPANotPresent) { + const auto p0 = std::make_shared(element::f32, Shape{1, 32, 32}); + const auto p1 = std::make_shared(element::f32, Shape{1, 32, 32}); + const auto add = std::make_shared(p0, p1); + const auto result = std::make_shared(add); + + auto model = std::make_shared(ResultVector{result}, ParameterVector{p0, p1}); + + ov::pass::Manager manager; + manager.register_pass(); + EXPECT_THROW(manager.run_passes(model), ov::Exception); +} \ No newline at end of file diff --git a/src/core/src/pass/sdpa_to_paged_attention.cpp b/src/core/src/pass/sdpa_to_paged_attention.cpp index 1eaf15c928db01..0d71c6a4b0d8dc 100644 --- a/src/core/src/pass/sdpa_to_paged_attention.cpp +++ b/src/core/src/pass/sdpa_to_paged_attention.cpp @@ -7,6 +7,7 @@ #include "openvino/cc/pass/itt.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/gather.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" #include "openvino/op/shape_of.hpp" #include "openvino/op/unsqueeze.hpp" #include "openvino/pass/manager.hpp" @@ -29,6 +30,11 @@ static std::shared_ptr setName(std::shared_ptr nod bool ov::pass::SDPAToPagedAttention::run_on_model(const std::shared_ptr& model) { RUN_ON_MODEL_SCOPE(SDPAToPagedAttention); + + OPENVINO_ASSERT(ov::op::util::has_op_with_type(model), + "No ScaledDotProductAttention operation observed in the graph, cannot perform" + "the SDPAToPagedAttention transformation."); + auto max_context_len = setName(std::make_shared(element::i32, PartialShape{}), "max_context_len"); ParameterVector model_remaining_params = { setName(std::make_shared(element::i32, PartialShape{-1}), "past_lens"), From 5faf95018ddc2b80a1721c8889f1b96fabc26e92 Mon Sep 17 00:00:00 2001 From: Karol Blaszczak Date: Fri, 7 Jun 2024 12:28:41 +0200 Subject: [PATCH 10/18] [DOCS] fix back to top button (#24884) --- .../static/css/openvino_sphinx_theme.css | 7 ---- docs/sphinx_setup/_static/css/custom.css | 41 +++++++++++++++---- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/docs/openvino_sphinx_theme/openvino_sphinx_theme/static/css/openvino_sphinx_theme.css b/docs/openvino_sphinx_theme/openvino_sphinx_theme/static/css/openvino_sphinx_theme.css index b180a5a096eaf3..8c038c795542e6 100644 --- a/docs/openvino_sphinx_theme/openvino_sphinx_theme/static/css/openvino_sphinx_theme.css +++ b/docs/openvino_sphinx_theme/openvino_sphinx_theme/static/css/openvino_sphinx_theme.css @@ -55,13 +55,6 @@ body { border-color: rgb(var(--ost-color-primary)); } -/* Scrollbox Extension */ - 
-.scrollbox { - overflow-y:scroll; - height:300px; - margin-bottom: 20px; -} /* Syntax Highlighting */ diff --git a/docs/sphinx_setup/_static/css/custom.css b/docs/sphinx_setup/_static/css/custom.css index a9536c7aa05401..57e2b35a395e06 100644 --- a/docs/sphinx_setup/_static/css/custom.css +++ b/docs/sphinx_setup/_static/css/custom.css @@ -129,7 +129,7 @@ nav.bd-links li > a:hover { text-decoration: underline } -ul#navbar-main-elements > li:hover { +ul#navbar-main-elements > li:hover { text-decoration: underline; color: #fff; } @@ -223,7 +223,7 @@ details.sd-dropdown:not([open]).sd-card { /* Ttile is at the same place for both open and close states */ .sd-card-header { border-radius: 0px !important; - + } /* Ttile is at the same place for both open and close states */ @@ -262,7 +262,7 @@ details.sd-dropdown .sd-summary-title { min-width: 125px!important; } -[aria-labelledby="version-selector"] .dropdown-item { +[aria-labelledby="version-selector"] .dropdown-item { padding: 0.25rem 0.5rem!important; } @@ -437,21 +437,21 @@ div.highlight { /* =================================================== */ @media (max-width: 720px) { - .container, + .container, .container-lg, .container-md, .container-sm, .container-xl { max-width: 1850px; } - + .transition-banner { margin-top: 2rem; } } @media (min-width: 1200px) { - .container, + .container, .container-lg, .container-md, .container-sm, @@ -921,6 +921,7 @@ div.highlight { /* Content formatting for the benchmark pages */ +/* =================================================== */ .picker-options { margin: 15px 0; } @@ -1223,7 +1224,7 @@ table#model-accuracy-and-perf-int8-fp32-table td.data { .newsletter-submit-btn:before { font-family: "Font Awesome 5 Free"; - content: "\f0e0\00a0"; + content: "\f0e0\00a0"; font-size: 1rem; } @@ -1307,3 +1308,29 @@ input:-webkit-autofill { -webkit-box-shadow: 0 0 0px 1000px white inset; } +/* Scrollbox Extension */ +/* =================================================== */ +.scrollbox { + overflow-y:scroll; + height:300px; + margin-bottom: 20px; +} + +/* overriding the 'back to top btn' style from webpack://pydata_sphinx_theme/src/pydata_sphinx_theme/assets/styles/base/_base.scss */ +/* =================================================== */ +#pst-back-to-top { + top: unset; + bottom: 3rem; + left: unset; + right: -2rem; + background-color: #0068b5; + font-size: .8rem; + border-radius: .25rem !important; +} + +/* hide the header for the side menu */ +/* =================================================== */ + +nav.bd-links p.bd-links__title { + display: none; +} \ No newline at end of file From c54240547017e8eb5e8a136880727dd4daad3822 Mon Sep 17 00:00:00 2001 From: Evgenya Nugmanova Date: Fri, 7 Jun 2024 14:34:53 +0400 Subject: [PATCH 11/18] More accurate shape of optimization (#24845) ### Details: - *Topological order was corrected to perform the best ShapeOf reconnections possible* ### Tickets: - *CVS-142953* --- .../fuse_rotary_positional_embeddings.cpp | 2 +- .../symbol_optimization.cpp | 104 +++++++++++++++++- .../label_optimization.cpp | 18 +-- 3 files changed, 106 insertions(+), 18 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp b/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp index 2f3ddd5d843ae3..86507326c25a44 100644 --- a/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp +++ 
b/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp @@ -555,7 +555,7 @@ ov::pass::RoPEFusionQwen::RoPEFusionQwen(int split_output_id) { {{"special_zero", true}}); auto slice_Slice_543 = GenSlice(view_Reshape_424, 0, head_size, 1, 3); // tensor_array - auto hidden_states = makePattern("f32[?,?,?]"); // + auto hidden_states = makePattern(); // auto ShapeOf_485735 = makePattern({hidden_states}, {}); auto Multiply_567524 = makePattern({ShapeOf_485735, {-1}}, {{"auto_broadcast", "numpy"}}); auto Gather_377635 = makePattern({Multiply_567524, {1}, 0}, {{"batch_dims", 0}}); diff --git a/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp b/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp index 1a4507c08dc9f0..3bf315bebf4467 100644 --- a/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp +++ b/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp @@ -16,6 +16,7 @@ #include "openvino/op/shape_of.hpp" #include "openvino/op/squeeze.hpp" #include "openvino/op/util/multi_subgraph_base.hpp" +#include "openvino/op/util/op_types.hpp" #include "transformations/utils/utils.hpp" namespace { @@ -222,7 +223,101 @@ void optimize_value_usage(ov::Output& output, STS_map& symbol_shape_so } } -void save_shape_sources(const ov::Output& output, STS_map& symbol_shape_source) { +std::vector> topological_order(const std::shared_ptr& m) { + auto order = m->get_ordered_ops(); + + // step 1: split model into parameter related and parameter non-related ops + const std::string op_depends_on_parameter = "topological_sort_op_depends_on"; + // values: true - parameter dependent; false otherwise + for (const auto& op : order) { + if (ov::as_type_ptr(op)) { + op->get_rt_info()[op_depends_on_parameter] = true; + } else if (ov::as_type_ptr(op) || ov::as_type_ptr(op) || + ov::as_type_ptr(op) || + std::dynamic_pointer_cast(op)) { + op->get_rt_info()[op_depends_on_parameter] = false; + } else { // deduce op type from inputs + const auto& inputs = op->input_values(); + op->get_rt_info()[op_depends_on_parameter] = + std::any_of(inputs.begin(), + inputs.end(), + [&op_depends_on_parameter](const ov::Output& input) { + return input.get_node_shared_ptr()->get_rt_info()[op_depends_on_parameter].as(); + }); + } + } + // step 2: starting from Result -- assign weight to ops: + // if parameter dependant, weights is maximum of output indices plus one + // else weights is maximum of output indices + // this step doesn't assign weights to all the ops, this is intentional and will be used in the following step + const std::string weight_rt_info_name = "topological_sort_weight"; + for (auto it = order.rbegin(); it != order.rend(); ++it) { + const auto& op = *it; + int64_t weight = 0; + if (ov::as_type_ptr(op)) { + op->get_rt_info()[weight_rt_info_name] = weight; + } else { + bool output_has_weight = false; + for (const auto& output : op->outputs()) { + for (const auto& input : output.get_target_inputs()) { + const auto& output_op = input.get_node(); + const auto& rt_info = output_op->get_rt_info(); + if (!rt_info.count(weight_rt_info_name)) + continue; + output_has_weight = true; + auto output_weight = rt_info.at(weight_rt_info_name).as(); + weight = output_weight > weight ? 
output_weight : weight; + } + } + if (output_has_weight) { + if (op->get_rt_info()[op_depends_on_parameter].as()) { + weight += 1; + } + op->get_rt_info()[weight_rt_info_name] = weight; + } + } + } + // step 3: make propagation for all the nodes: + // if weight is already assigned -- skip operation + // else operation weights is minimum of input indices + // if all operation inputs have no weights -- this op is isolated and this algorithm doesn't make sense, + // such cases are extremely rare and rather theoretical, to handle them we return original ov::Model op order + std::map>> level_to_vector; + for (const auto& op : order) { + if (!op->get_rt_info().count(weight_rt_info_name)) { + int64_t weight = std::numeric_limits::max(); + for (const auto& input : op->input_values()) { + const auto& rt_info = input.get_node_shared_ptr()->get_rt_info(); + if (!rt_info.count(weight_rt_info_name)) + continue; + auto input_weight = rt_info.at(weight_rt_info_name).as(); + weight = input_weight < weight ? input_weight : weight; + } + if (weight != std::numeric_limits::max()) + op->get_rt_info()[weight_rt_info_name] = weight; + else + return m->get_ordered_ops(); + } + level_to_vector[op->get_rt_info().at(weight_rt_info_name).as()].push_back(op); + } + // finalization: descending order for levels and ops within level are ordered by get_ordered_ops + std::vector> result; + result.reserve(order.size()); + for (auto it = level_to_vector.rbegin(); it != level_to_vector.rend(); ++it) { + const auto& item = *it; + result.insert(result.end(), item.second.begin(), item.second.end()); + for (const auto& op : item.second) { + op->get_rt_info().erase(weight_rt_info_name); + op->get_rt_info().erase(op_depends_on_parameter); + } + } + return result; +} + +void save_shape_sources(const std::shared_ptr& op, STS_map& symbol_shape_source) { + if (!ov::is_type(op) && !ov::is_type(op)) + return; + const auto& output = op->input_value(0); for (const auto& d : output.get_partial_shape()) { if (d.is_static()) continue; @@ -240,7 +335,7 @@ bool ov::pass::OptimizeSymbolsUsedAsValues::run_on_model(const std::shared_ptrget_ordered_ops()) { + for (const auto& op : topological_order(m)) { // Result has output port which has shared (during validate_and_infer_type) tensor with input port. // Transformations may replace input of Result. 
After replacement and before Result::validate_and_infer_type -- // output tensor of Result may contain inaccurate shape / symbols due to the sharing with tensor which may be @@ -252,10 +347,9 @@ bool ov::pass::OptimizeSymbolsUsedAsValues::run_on_model(const std::shared_ptroutputs()) { + for (auto& output : op->outputs()) optimize_value_usage(output, symbol_shape_source, symbol_value_source); - save_shape_sources(output, symbol_shape_source); - } + save_shape_sources(op, symbol_shape_source); } return true; } diff --git a/src/common/transformations/tests/symbolic_transformations/label_optimization.cpp b/src/common/transformations/tests/symbolic_transformations/label_optimization.cpp index 881d02b20d295a..eb108e4c6591ba 100644 --- a/src/common/transformations/tests/symbolic_transformations/label_optimization.cpp +++ b/src/common/transformations/tests/symbolic_transformations/label_optimization.cpp @@ -75,22 +75,16 @@ TEST_F(TransformationTestsF, ApplySymbolEquivalence_Concat_Values) { auto input_2 = make_shared(element::f32, PartialShape::dynamic(4)); auto concat = make_shared(OutputVector{input_1, input_2}, -1); - auto shape_1 = make_shared(input_1); - auto gather_1 = make_shared(shape_1, - v0::Constant::create(element::i64, {1}, {3}), - v0::Constant::create(element::i64, {}, {0})); - - auto shape_2 = make_shared(input_2); - auto gather_2 = make_shared(shape_2, - v0::Constant::create(element::i64, {1}, {3}), - v0::Constant::create(element::i64, {}, {0})); - - auto sum = make_shared(gather_1, gather_2); + auto shape = make_shared(concat); + auto gather = make_shared(shape, + v0::Constant::create(element::i64, {1}, {-1}), + v0::Constant::create(element::i64, {}, {0})); auto reshape = make_shared( concat, - make_shared(OutputVector{sum, v0::Constant::create(element::i64, {1}, {-1})}, 0), + make_shared(OutputVector{gather, v0::Constant::create(element::i64, {1}, {-1})}, 0), false); + model_ref = make_shared(NodeVector{reshape}, ParameterVector{input_2, input_1}); } } From bb179c69d1c03aea68a9817742abc111ae72f3cd Mon Sep 17 00:00:00 2001 From: Tatiana Savina Date: Fri, 7 Jun 2024 13:46:26 +0200 Subject: [PATCH 12/18] [DOCS] CPU perf hints doc review (#24827) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- .../assets/snippets/multi_threading.py | 2 +- .../cpu-device.rst | 10 +- ...erformance-hint-and-threads-scheduling.rst | 113 ++++++++++-------- 3 files changed, 68 insertions(+), 57 deletions(-) diff --git a/docs/articles_en/assets/snippets/multi_threading.py b/docs/articles_en/assets/snippets/multi_threading.py index 9a5baa1e7575b1..6994b26a0d6552 100644 --- a/docs/articles_en/assets/snippets/multi_threading.py +++ b/docs/articles_en/assets/snippets/multi_threading.py @@ -37,7 +37,7 @@ # ! [ov:intel_cpu:multi_threading:part0] # ! [ov:intel_cpu:multi_threading:part1] -# Disable CPU threads pinning for inference when system supoprt it +# Disable CPU threads pinning for inference when the system supports it compiled_model_4 = core.compile_model( model=model, device_name=device_name, diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst index b45ff8140031e6..d95f97959f5b2a 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst @@ -3,7 +3,11 @@ CPU Device ========== +.. 
toctree:: + :maxdepth: 1 + :hidden: + cpu-device/performance-hint-and-threads-scheduling .. meta:: :description: The CPU plugin in the Intel® Distribution of OpenVINO™ toolkit @@ -246,12 +250,6 @@ For more details, see the :doc:`optimization guide <../optimize-inference>` and on data transfer between NUMA nodes. In that case it is better to use the ``ov::hint::PerformanceMode::LATENCY`` performance hint. For more details see the :doc:`performance hints <../optimize-inference/high-level-performance-hints>` overview. - .. toctree:: - :maxdepth: 1 - :hidden: - - cpu-device/performance-hint-and-threads-scheduling - Dynamic Shapes +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device/performance-hint-and-threads-scheduling.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device/performance-hint-and-threads-scheduling.rst index 93c8c0bd6b36c7..3087bcf2d95783 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device/performance-hint-and-threads-scheduling.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device/performance-hint-and-threads-scheduling.rst @@ -1,6 +1,5 @@ -.. {#openvino_docs_OV_UG_supported_plugins_CPU_Hints_Threading} -Performance Hints and Threads Scheduling +Performance Hints and Threads Scheduling ======================================== .. meta:: @@ -8,37 +7,46 @@ Performance Hints and Threads Scheduling detects CPU architecture and sets low-level properties based on performance hints automatically. -While all supported devices in OpenVINO offer low-level performance settings, it is advisable not to widely use these settings unless targeting specific platforms and models. The recommended approach is configuring performance in OpenVINO Runtime using high-level performance hints property ``ov::hint::performance_mode``. Performance hints ensure optimal portability and scalability of the applications across various platforms and models. - -To simplify the configuration of hardware devices, OpenVINO offers two performance hints: the latency hint ``ov::hint::PerformanceMode::LATENCY`` and the throughput hint ``ov::hint::PerformanceMode::THROUGHPUT``. - -- ``ov::inference_num_threads`` limits number of logical processors used for CPU inference. - If the number set by the user is greater than the number of logical processors on the platform, multi-threading scheduler only uses the platform number for CPU inference. -- ``ov::num_streams`` limits number of infer requests that can be run in parallel. - If the number set by the user is greater than the number of inference threads, multi-threading scheduler only uses the number of inference threads to ensure that there is at least one thread per stream. -- ``ov::hint::scheduling_core_type`` limits the type of CPU cores for CPU inference when user runs inference on a hybird platform that includes both Performance-cores (P-cores) with Efficient-cores (E-cores). - If user platform only has one type of CPU cores, this property has no effect, and CPU inference always uses this unique core type. -- ``ov::hint::enable_hyper_threading`` limits the use of one or two logical processors per CPU core when platform has CPU hyperthreading enabled. 
+While all supported devices in OpenVINO offer low-level performance settings, it is advisable +not to use these settings widely unless targeting specific platforms and models. The recommended +approach is to configure performance in OpenVINO Runtime using the high-level performance hints +property ``ov::hint::performance_mode``. Performance hints ensure optimal portability and +scalability of applications across various platforms and models. + +To simplify the configuration of hardware devices, OpenVINO offers two performance hints: the +latency hint ``ov::hint::PerformanceMode::LATENCY`` and the throughput hint +``ov::hint::PerformanceMode::THROUGHPUT``. + +- ``ov::inference_num_threads`` limits the number of logical processors used for CPU inference. + If the number set by the user is greater than the number of logical processors on the platform, + the multi-threading scheduler only uses the platform number for CPU inference. +- ``ov::num_streams`` limits the number of infer requests that can be run in parallel. + If the number set by the user is greater than the number of inference threads, multi-threading + scheduler only uses the number of inference threads to ensure that there is at least one thread per stream. +- ``ov::hint::scheduling_core_type`` specifies the type of CPU cores for CPU inference when the user runs + inference on a hybird platform that includes both Performance-cores (P-cores) and Efficient-cores (E-cores). + If the user platform only has one type of CPU core, this property has no effect, and CPU inference always uses this unique core type. +- ``ov::hint::enable_hyper_threading`` limits the use of one or two logical processors per CPU + core when the platform has CPU hyperthreading enabled. If there is only one logical processor per CPU core, such as Efficient-cores, this property has no effect, and CPU inference uses all logical processors. -- ``ov::hint::enable_cpu_pinning`` enable CPU pinning during CPU inference. - If user enable this property but inference scenario cannot support it, this property will be disabled during model compilation. - -For additional details on the above configurations, refer to: +- ``ov::hint::enable_cpu_pinning`` enables CPU pinning during CPU inference. + If the user enables this property but the inference scenario does not support it, this property will be disabled during model compilation. -- `Multi-stream Execution `__ +For additional details on the above configurations, refer to `Multi-stream Execution `__. Latency Hint ################################### -In this scenario, the default setting of ``ov::hint::scheduling_core_type`` is determined by the model precision and the ratio of P-cores and E-cores. +In this scenario, the default setting of ``ov::hint::scheduling_core_type`` is determined by +the model precision and the ratio of P-cores and E-cores. .. note:: - P-cores is short for Performance-cores and E-cores is for Efficient-cores. These are available after 12th Gen Intel® Core™ Processor. + P-cores is short for Performance-cores and E-cores stands for Efficient-cores. These types of cores are available starting with the 12th Gen Intel® Core™ processors. .. 
_Core Type Table of Latency Hint: +----------------------------+---------------------+---------------------+ -| | INT8 model | FP32 model | +| | INT8 Model | FP32 Model | +============================+=====================+=====================+ | E-cores / P-cores < 2 | P-cores | P-cores | +----------------------------+---------------------+---------------------+ @@ -49,38 +57,39 @@ In this scenario, the default setting of ``ov::hint::scheduling_core_type`` is d .. note:: - Both P-cores and E-cores may be used for any configuration starting from 14th Gen Intel® Core™ Processor on Windows. + Both P-cores and E-cores may be used for any configuration starting with 14th Gen Intel® Core™ processors on Windows. -Then the default settings of low-level performance properties on Windows and Linux are as follows: +Then the default settings for low-level performance properties on Windows and Linux are as follows: -+--------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+ -| Property | Windows | Linux | -+======================================+================================================================+================================================================+ -| ``ov::num_streams`` | 1 | 1 | -+--------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+ -| ``ov::inference_num_threads`` | is equal to number of P-cores or P-cores+E-cores on one socket | is equal to number of P-cores or P-cores+E-cores on one socket | -+--------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+ -| ``ov::hint::scheduling_core_type`` | `Core Type Table of Latency Hint`_ | `Core Type Table of Latency Hint`_ | -+--------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+ -| ``ov::hint::enable_hyper_threading`` | No | No | -+--------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+ -| ``ov::hint::enable_cpu_pinning`` | No / Not Supported | Yes except using P-cores and E-cores together | -+--------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+ ++--------------------------------------+------------------------------------------------------------------------+--------------------------------------------------------------------+ +| Property | Windows | Linux | ++======================================+========================================================================+====================================================================+ +| ``ov::num_streams`` | 1 | 1 | ++--------------------------------------+------------------------------------------------------------------------+--------------------------------------------------------------------+ +| ``ov::inference_num_threads`` | is equal to the number of P-cores or P-cores+E-cores on one socket | is equal to the number of P-cores or P-cores+E-cores on one socket | 
++--------------------------------------+------------------------------------------------------------------------+--------------------------------------------------------------------+ +| ``ov::hint::scheduling_core_type`` | `Core Type Table of Latency Hint`_ | `Core Type Table of Latency Hint`_ | ++--------------------------------------+------------------------------------------------------------------------+--------------------------------------------------------------------+ +| ``ov::hint::enable_hyper_threading`` | No | No | ++--------------------------------------+------------------------------------------------------------------------+--------------------------------------------------------------------+ +| ``ov::hint::enable_cpu_pinning`` | No / Not Supported | Yes except using P-cores and E-cores together | ++--------------------------------------+------------------------------------------------------------------------+--------------------------------------------------------------------+ .. note:: - - ``ov::hint::scheduling_core_type`` might be adjusted for particular inferred model on particular platform based on internal heuristics to guarantee best performance. + - ``ov::hint::scheduling_core_type`` may be adjusted for a particular inferred model on a specific platform based on internal heuristics to guarantee optimal performance. - Both P-cores and E-cores are used for the Latency Hint on Intel® Core™ Ultra Processors on Windows, except in the case of large language models. - - In case hyper-threading is enabled, two logical processors share hardware resource of one CPU core. OpenVINO do not expect to use both logical processors in one stream for one infer request. So ``ov::hint::enable_hyper_threading`` is ``No`` in this scenario. - - ``ov::hint::enable_cpu_pinning`` is disabled by default on Windows/Mac, and enabled on Linux. Such default settings are aligned with typical workloads running in corresponding environment to guarantee better OOB performance. + - In case hyper-threading is enabled, two logical processors share the hardware resources of one CPU core. OpenVINO does not expect to use both logical processors in one stream for a single infer request. So ``ov::hint::enable_hyper_threading`` is set to ``No`` in this scenario. + - ``ov::hint::enable_cpu_pinning`` is disabled by default on Windows and macOS, and enabled on Linux. Such default settings are aligned with typical workloads running in the corresponding environments to guarantee better out-of-the-box (OOB) performance. Throughput Hint ###################################### -In this scenario, thread scheduling first evaluates the memory pressure of the model being inferred on the current platform, and determines the number of threads per stream, as shown below. +In this scenario, thread scheduling first evaluates the memory pressure of the model being +inferred on the current platform, and determines the number of threads per stream, as shown below. +-----------------+-----------------------+ -| Memory Pressure | Threads per stream | +| Memory Pressure | Threads per Stream | +=================+=======================+ | low | 1 P-core or 2 E-cores | +-----------------+-----------------------+ @@ -89,12 +98,13 @@ In this scenario, thread scheduling first evaluates the memory pressure of the m | high | 3 or 4 or 5 | +-----------------+-----------------------+ -Then the value of ``ov::num_streams`` is calculated as ``ov::inference_num_threads`` divided by the number of threads per stream. 
The default settings of low-level performance properties on Windows and Linux are as follows: +Then the value of ``ov::num_streams`` is calculated by dividing ``ov::inference_num_threads`` +by the number of threads per stream. The default settings for low-level performance properties on Windows and Linux are as follows: +--------------------------------------+-------------------------------+-------------------------------+ | Property | Windows | Linux | +======================================+===============================+===============================+ -| ``ov::num_streams`` | Calculate as above | Calculate as above | +| ``ov::num_streams`` | Calculated as above | Calculated as above | +--------------------------------------+-------------------------------+-------------------------------+ | ``ov::inference_num_threads`` | Number of P-cores and E-cores | Number of P-cores and E-cores | +--------------------------------------+-------------------------------+-------------------------------+ @@ -107,16 +117,17 @@ Then the value of ``ov::num_streams`` is calculated as ``ov::inference_num_threa .. note:: - - By default, different core types are not mixed within single stream in this scenario. And cores from different numa nodes are not mixed within single stream. + - By default, different core types are not mixed within a single stream in this scenario. The cores from different NUMA nodes are not mixed within a single stream. Multi-Threading Optimization ############################################## -User can use the following properties to limit available CPU resource for model inference. If the platform or operating system can support this behavior, OpenVINO Runtime will perform multi-threading scheduling based on limited available CPU. +The following properties can be used to limit the available CPU resources for model inference. +If the platform or operating system supports this behavior, the OpenVINO Runtime will perform multi-threading scheduling based on the limited available CPU. -- ``ov::inference_num_threads`` -- ``ov::hint::scheduling_core_type`` -- ``ov::hint::enable_hyper_threading`` +- ``ov::inference_num_threads`` +- ``ov::hint::scheduling_core_type`` +- ``ov::hint::enable_hyper_threading`` .. tab-set:: @@ -137,9 +148,11 @@ User can use the following properties to limit available CPU resource for model .. note:: - ``ov::hint::scheduling_core_type`` and ``ov::hint::enable_hyper_threading`` only support Intel® x86-64 CPU on Linux and Windows in current release. + ``ov::hint::scheduling_core_type`` and ``ov::hint::enable_hyper_threading`` only support Intel® x86-64 CPU on Linux and Windows in the current release. -In some use cases, OpenVINO Runtime will enable CPU threads pinning by default for better performance. User can also turn it on or off using property ``ov::hint::enable_cpu_pinning``. Disable threads pinning might be beneficial in complex applications with several workloads executed in parallel. +In some use cases, OpenVINO Runtime will enable CPU thread pinning by default for better performance. +Users can also turn this feature on or off using the property ``ov::hint::enable_cpu_pinning``. +Disabling thread pinning may be beneficial in complex applications where several workloads are executed in parallel. .. 
tab-set:: From ba5c45a4ee2f9f6b8ad615cec037bff687a30b76 Mon Sep 17 00:00:00 2001 From: Anastasia Kuporosova Date: Fri, 7 Jun 2024 13:56:36 +0200 Subject: [PATCH 13/18] [PyOV] Fix hanging on infer request destruction (#24722) ### Details: - Initial problem: `test_custom_op` hanged on destruction because it was waiting for a thread which tried to acquire GIL. - The second problem is that pybind11 doesn't allow to work with GIL besides of current scope and it's impossible to release GIL for destructors. https://github.com/pybind/pybind11/issues/1446 - Current solution allows to release GIL for InferRequest and all called by chain destructors. ### Tickets: - CVS-141744 --- .../src/pyopenvino/core/async_infer_queue.cpp | 22 ++--- .../python/src/pyopenvino/core/common.cpp | 8 +- .../src/pyopenvino/core/infer_request.cpp | 80 ++++++++++--------- .../src/pyopenvino/core/infer_request.hpp | 15 +++- .../src/pyopenvino/frontend/extension.cpp | 19 +++-- .../python/src/pyopenvino/utils/utils.cpp | 7 ++ .../python/src/pyopenvino/utils/utils.hpp | 2 + .../python/tests/test_graph/test_op.py | 4 +- 8 files changed, 95 insertions(+), 62 deletions(-) diff --git a/src/bindings/python/src/pyopenvino/core/async_infer_queue.cpp b/src/bindings/python/src/pyopenvino/core/async_infer_queue.cpp index 52ba997e6ac2c5..dbb6608b50f0b5 100644 --- a/src/bindings/python/src/pyopenvino/core/async_infer_queue.cpp +++ b/src/bindings/python/src/pyopenvino/core/async_infer_queue.cpp @@ -15,6 +15,7 @@ #include "pyopenvino/core/common.hpp" #include "pyopenvino/core/infer_request.hpp" +#include "pyopenvino/utils/utils.hpp" namespace py = pybind11; @@ -64,7 +65,7 @@ class AsyncInferQueue { }); size_t idle_handle = m_idle_handles.front(); // wait for request to make sure it returned from callback - m_requests[idle_handle].m_request.wait(); + m_requests[idle_handle].m_request->wait(); if (m_errors.size() > 0) throw m_errors.front(); return idle_handle; @@ -75,7 +76,7 @@ class AsyncInferQueue { // release GIL to avoid deadlock on python callback py::gil_scoped_release release; for (auto&& request : m_requests) { - request.m_request.wait(); + request.m_request->wait(); } // acquire the mutex to access m_errors std::lock_guard lock(m_mutex); @@ -87,7 +88,7 @@ class AsyncInferQueue { for (size_t handle = 0; handle < m_requests.size(); handle++) { // auto end_time = m_requests[handle].m_end_time; // TODO: pass it bellow? like in InferRequestWrapper - m_requests[handle].m_request.set_callback([this, handle /* ... */](std::exception_ptr exception_ptr) { + m_requests[handle].m_request->set_callback([this, handle /* ... 
*/](std::exception_ptr exception_ptr) { *m_requests[handle].m_end_time = Time::now(); { // acquire the mutex to access m_idle_handles @@ -110,14 +111,17 @@ class AsyncInferQueue { } void set_custom_callbacks(py::function f_callback) { + // need to acquire GIL before py::function deletion + auto callback_sp = Common::utils::wrap_pyfunction(std::move(f_callback)); + for (size_t handle = 0; handle < m_requests.size(); handle++) { - m_requests[handle].m_request.set_callback([this, f_callback, handle](std::exception_ptr exception_ptr) { + m_requests[handle].m_request->set_callback([this, callback_sp, handle](std::exception_ptr exception_ptr) { *m_requests[handle].m_end_time = Time::now(); if (exception_ptr == nullptr) { // Acquire GIL, execute Python function py::gil_scoped_acquire acquire; try { - f_callback(m_requests[handle], m_user_ids[handle]); + (*callback_sp)(m_requests[handle], m_user_ids[handle]); } catch (const py::error_already_set& py_error) { // This should behave the same as assert(!PyErr_Occurred()) // since constructor for pybind11's error_already_set is @@ -193,13 +197,13 @@ void regclass_AsyncInferQueue(py::module m) { // Set new inputs label/id from user self.m_user_ids[handle] = userdata; // Update inputs if there are any - self.m_requests[handle].m_request.set_input_tensor(inputs); + self.m_requests[handle].m_request->set_input_tensor(inputs); // Now GIL can be released - we are NOT working with Python objects in this block { py::gil_scoped_release release; *self.m_requests[handle].m_start_time = Time::now(); // Start InferRequest in asynchronus mode - self.m_requests[handle].m_request.start_async(); + self.m_requests[handle].m_request->start_async(); } }, py::arg("inputs"), @@ -239,13 +243,13 @@ void regclass_AsyncInferQueue(py::module m) { // Set new inputs label/id from user self.m_user_ids[handle] = userdata; // Update inputs if there are any - Common::set_request_tensors(self.m_requests[handle].m_request, inputs); + Common::set_request_tensors(*self.m_requests[handle].m_request, inputs); // Now GIL can be released - we are NOT working with Python objects in this block { py::gil_scoped_release release; *self.m_requests[handle].m_start_time = Time::now(); // Start InferRequest in asynchronus mode - self.m_requests[handle].m_request.start_async(); + self.m_requests[handle].m_request->start_async(); } }, py::arg("inputs"), diff --git a/src/bindings/python/src/pyopenvino/core/common.cpp b/src/bindings/python/src/pyopenvino/core/common.cpp index 9f57b794e2bff6..179002127960cd 100644 --- a/src/bindings/python/src/pyopenvino/core/common.cpp +++ b/src/bindings/python/src/pyopenvino/core/common.cpp @@ -433,10 +433,14 @@ ov::op::v0::Constant create_shared(py::array& array) { // If ndim is equal to 0, creates scalar Constant. // If size is equal to 0, creates empty Constant. if (array_helpers::is_contiguous(array)) { - auto memory = std::make_shared>( + auto buffer = new ov::SharedBuffer( static_cast((array.ndim() == 0 || array.size() == 0) ? array.mutable_data() : array.mutable_data(0)), array.ndim() == 0 ? array.itemsize() : array.nbytes(), array); + std::shared_ptr> memory(buffer, [](ov::SharedBuffer* buffer) { + py::gil_scoped_acquire acquire; + delete buffer; + }); return ov::op::v0::Constant(type_helpers::get_ov_type(array), array_helpers::get_shape(array), memory); } // If passed array is not C-style, throw an error. 
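The ``ov::SharedBuffer`` change above, like ``wrap_pyfunction`` and ``wrap_infer_request_to_sp`` later in this patch, applies one recurring idea: keep objects that own Python state behind a ``std::shared_ptr`` whose custom deleter manages the GIL explicitly, so the final destruction is safe no matter which thread drops the last reference. A minimal sketch of that pattern follows; the helper name and the template form are illustrative only and are not part of the patch.

```cpp
#include <pybind11/pybind11.h>

#include <memory>
#include <utility>

namespace py = pybind11;

// Illustrative helper (not in the patch): wrap an object whose destructor
// touches Python state (it may hold a py::object, py::function or py::array)
// so that the GIL is acquired right before the object is destroyed, even when
// the last shared_ptr reference is dropped from a non-Python thread.
template <typename T>
std::shared_ptr<T> hold_with_gil_deleter(T obj) {
    return std::shared_ptr<T>(new T(std::move(obj)), [](T* ptr) {
        py::gil_scoped_acquire acquire;  // Python refcounting is now safe here
        delete ptr;
    });
}
```

The converse variant appears in ``wrap_infer_request_to_sp`` below: there the deleter releases the GIL around ``delete``, so worker threads that still need to acquire the GIL inside completion callbacks can finish instead of deadlocking with the destructor.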
@@ -614,7 +618,7 @@ uint32_t get_optimal_number_of_requests(const ov::CompiledModel& actual) { py::dict outputs_to_dict(InferRequestWrapper& request, bool share_outputs, bool decode_strings) { py::dict res; for (const auto& out : request.m_outputs) { - auto t = request.m_request.get_tensor(out); + auto t = request.m_request->get_tensor(out); if (t.get_element_type() == ov::element::string) { if (share_outputs) { PyErr_WarnEx(PyExc_RuntimeWarning, "Result of a string type will be copied to OVDict!", 1); diff --git a/src/bindings/python/src/pyopenvino/core/infer_request.cpp b/src/bindings/python/src/pyopenvino/core/infer_request.cpp index 93a52b1dad681f..9f572d273dc5f3 100644 --- a/src/bindings/python/src/pyopenvino/core/infer_request.cpp +++ b/src/bindings/python/src/pyopenvino/core/infer_request.cpp @@ -18,7 +18,7 @@ inline py::object run_sync_infer(InferRequestWrapper& self, bool share_outputs, { py::gil_scoped_release release; *self.m_start_time = Time::now(); - self.m_request.infer(); + self.m_request->infer(); *self.m_end_time = Time::now(); } return Common::outputs_to_dict(self, share_outputs, decode_strings); @@ -38,7 +38,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_tensors", [](InferRequestWrapper& self, const py::dict& inputs) { - Common::set_request_tensors(self.m_request, inputs); + Common::set_request_tensors(*self.m_request, inputs); }, py::arg("inputs"), R"( @@ -51,7 +51,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_tensors", [](InferRequestWrapper& self, const std::string& tensor_name, const std::vector& tensors) { - self.m_request.set_tensors(tensor_name, tensors); + self.m_request->set_tensors(tensor_name, tensors); }, py::arg("tensor_name"), py::arg("tensors"), @@ -73,7 +73,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_tensors", [](InferRequestWrapper& self, const ov::Output& port, const std::vector& tensors) { - self.m_request.set_tensors(port, tensors); + self.m_request->set_tensors(port, tensors); }, py::arg("port"), py::arg("tensors"), @@ -100,7 +100,7 @@ void regclass_InferRequest(py::module m) { [](InferRequestWrapper& self, const py::dict& outputs) { auto outputs_map = Common::containers::cast_to_tensor_index_map(outputs); for (auto&& output : outputs_map) { - self.m_request.set_output_tensor(output.first, output.second); + self.m_request->set_output_tensor(output.first, output.second); } }, py::arg("outputs"), @@ -117,7 +117,7 @@ void regclass_InferRequest(py::module m) { [](InferRequestWrapper& self, const py::dict& inputs) { auto inputs_map = Common::containers::cast_to_tensor_index_map(inputs); for (auto&& input : inputs_map) { - self.m_request.set_input_tensor(input.first, input.second); + self.m_request->set_input_tensor(input.first, input.second); } }, py::arg("inputs"), @@ -131,7 +131,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_input_tensors", [](InferRequestWrapper& self, const std::vector& tensors) { - self.m_request.set_input_tensors(tensors); + self.m_request->set_input_tensors(tensors); }, py::arg("tensors"), R"( @@ -148,7 +148,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_input_tensors", [](InferRequestWrapper& self, size_t idx, const std::vector& tensors) { - self.m_request.set_input_tensors(idx, tensors); + self.m_request->set_input_tensors(idx, tensors); }, py::arg("idx"), py::arg("tensors"), @@ -168,7 +168,7 @@ void regclass_InferRequest(py::module m) { cls.def( "infer", [](InferRequestWrapper& self, const ov::Tensor& inputs, bool share_outputs, bool decode_strings) 
{ - self.m_request.set_input_tensor(inputs); + self.m_request->set_input_tensor(inputs); return run_sync_infer(self, share_outputs, decode_strings); }, py::arg("inputs"), @@ -197,7 +197,7 @@ void regclass_InferRequest(py::module m) { "infer", [](InferRequestWrapper& self, const py::dict& inputs, bool share_outputs, bool decode_strings) { // Update inputs if there are any - Common::set_request_tensors(self.m_request, inputs); + Common::set_request_tensors(*self.m_request, inputs); // Call Infer function return run_sync_infer(self, share_outputs, decode_strings); }, @@ -222,7 +222,7 @@ void regclass_InferRequest(py::module m) { "start_async", [](InferRequestWrapper& self, const ov::Tensor& inputs, py::object& userdata) { // Update inputs if there are any - self.m_request.set_input_tensor(inputs); + self.m_request->set_input_tensor(inputs); if (!userdata.is(py::none())) { if (self.m_user_callback_defined) { self.m_userdata = userdata; @@ -232,7 +232,7 @@ void regclass_InferRequest(py::module m) { } py::gil_scoped_release release; *self.m_start_time = Time::now(); - self.m_request.start_async(); + self.m_request->start_async(); }, py::arg("inputs"), py::arg("userdata"), @@ -261,7 +261,7 @@ void regclass_InferRequest(py::module m) { "start_async", [](InferRequestWrapper& self, const py::dict& inputs, py::object& userdata) { // Update inputs if there are any - Common::set_request_tensors(self.m_request, inputs); + Common::set_request_tensors(*self.m_request, inputs); if (!userdata.is(py::none())) { if (self.m_user_callback_defined) { self.m_userdata = userdata; @@ -271,7 +271,7 @@ void regclass_InferRequest(py::module m) { } py::gil_scoped_release release; *self.m_start_time = Time::now(); - self.m_request.start_async(); + self.m_request->start_async(); }, py::arg("inputs"), py::arg("userdata"), @@ -293,7 +293,7 @@ void regclass_InferRequest(py::module m) { cls.def( "cancel", [](InferRequestWrapper& self) { - self.m_request.cancel(); + self.m_request->cancel(); }, R"( Cancels inference request. @@ -303,7 +303,7 @@ void regclass_InferRequest(py::module m) { "wait", [](InferRequestWrapper& self) { py::gil_scoped_release release; - self.m_request.wait(); + self.m_request->wait(); }, R"( Waits for the result to become available. 
@@ -316,7 +316,7 @@ void regclass_InferRequest(py::module m) { "wait_for", [](InferRequestWrapper& self, const int timeout) { py::gil_scoped_release release; - return self.m_request.wait_for(std::chrono::milliseconds(timeout)); + return self.m_request->wait_for(std::chrono::milliseconds(timeout)); }, py::arg("timeout"), R"( @@ -337,7 +337,11 @@ void regclass_InferRequest(py::module m) { [](InferRequestWrapper& self, py::function callback, py::object& userdata) { self.m_userdata = userdata; self.m_user_callback_defined = true; - self.m_request.set_callback([&self, callback](std::exception_ptr exception_ptr) { + + // need to acquire GIL before py::function deletion + auto callback_sp = Common::utils::wrap_pyfunction(std::move(callback)); + + self.m_request->set_callback([&self, callback_sp](std::exception_ptr exception_ptr) { *self.m_end_time = Time::now(); try { if (exception_ptr) { @@ -348,7 +352,7 @@ void regclass_InferRequest(py::module m) { } // Acquire GIL, execute Python function py::gil_scoped_acquire acquire; - callback(self.m_userdata); + (*callback_sp)(self.m_userdata); }); }, py::arg("callback"), @@ -365,7 +369,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_tensor", [](InferRequestWrapper& self, const std::string& name) { - return self.m_request.get_tensor(name); + return self.m_request->get_tensor(name); }, py::arg("name"), R"( @@ -380,7 +384,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_tensor", [](InferRequestWrapper& self, const ov::Output& port) { - return self.m_request.get_tensor(port); + return self.m_request->get_tensor(port); }, py::arg("port"), R"( @@ -395,7 +399,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_tensor", [](InferRequestWrapper& self, const ov::Output& port) { - return self.m_request.get_tensor(port); + return self.m_request->get_tensor(port); }, py::arg("port"), R"( @@ -410,7 +414,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_input_tensor", [](InferRequestWrapper& self, size_t idx) { - return self.m_request.get_input_tensor(idx); + return self.m_request->get_input_tensor(idx); }, py::arg("index"), R"( @@ -427,7 +431,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_input_tensor", [](InferRequestWrapper& self) { - return self.m_request.get_input_tensor(); + return self.m_request->get_input_tensor(); }, R"( Gets input tensor of InferRequest. @@ -440,7 +444,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_output_tensor", [](InferRequestWrapper& self, size_t idx) { - return self.m_request.get_output_tensor(idx); + return self.m_request->get_output_tensor(idx); }, py::arg("index"), R"( @@ -456,7 +460,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_output_tensor", [](InferRequestWrapper& self) { - return self.m_request.get_output_tensor(); + return self.m_request->get_output_tensor(); }, R"( Gets output tensor of InferRequest. 
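For context, the flow that could previously hang is the ordinary user-facing one sketched below: an infer request with a Python callback simply goes out of scope. This is a hedged sketch against the public Python API; the model path, input layout and dtype are placeholders, not values taken from the patch.

```python
import numpy as np
import openvino as ov

core = ov.Core()
compiled = core.compile_model("model.xml", "CPU")  # placeholder model path
request = compiled.create_infer_request()

def on_finished(userdata):
    # Runs on a runtime worker thread; the binding acquires the GIL first.
    print("inference done, userdata:", userdata)

request.set_callback(on_finished, 42)

data = np.zeros(compiled.input(0).shape, dtype=np.float32)  # assumes a static input shape
request.start_async({compiled.input(0): data})
request.wait()

# Before this patch, dropping the request here could block: its destruction
# needed the GIL-aware handling of the stored py::function added above.
del request
```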
@@ -469,7 +473,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_tensor", [](InferRequestWrapper& self, const std::string& name, const ov::Tensor& tensor) { - self.m_request.set_tensor(name, tensor); + self.m_request->set_tensor(name, tensor); }, py::arg("name"), py::arg("tensor"), @@ -486,7 +490,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_tensor", [](InferRequestWrapper& self, const ov::Output& port, const ov::Tensor& tensor) { - self.m_request.set_tensor(port, tensor); + self.m_request->set_tensor(port, tensor); }, py::arg("port"), py::arg("tensor"), @@ -503,7 +507,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_tensor", [](InferRequestWrapper& self, const ov::Output& port, const ov::Tensor& tensor) { - self.m_request.set_tensor(port, tensor); + self.m_request->set_tensor(port, tensor); }, py::arg("port"), py::arg("tensor"), @@ -520,7 +524,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_input_tensor", [](InferRequestWrapper& self, size_t idx, const ov::Tensor& tensor) { - self.m_request.set_input_tensor(idx, tensor); + self.m_request->set_input_tensor(idx, tensor); }, py::arg("index"), py::arg("tensor"), @@ -538,7 +542,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_input_tensor", [](InferRequestWrapper& self, const ov::Tensor& tensor) { - self.m_request.set_input_tensor(tensor); + self.m_request->set_input_tensor(tensor); }, py::arg("tensor"), R"( @@ -553,7 +557,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_output_tensor", [](InferRequestWrapper& self, size_t idx, const ov::Tensor& tensor) { - self.m_request.set_output_tensor(idx, tensor); + self.m_request->set_output_tensor(idx, tensor); }, py::arg("index"), py::arg("tensor"), @@ -570,7 +574,7 @@ void regclass_InferRequest(py::module m) { cls.def( "set_output_tensor", [](InferRequestWrapper& self, const ov::Tensor& tensor) { - self.m_request.set_output_tensor(tensor); + self.m_request->set_output_tensor(tensor); }, py::arg("tensor"), R"( @@ -585,7 +589,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_profiling_info", [](InferRequestWrapper& self) { - return self.m_request.get_profiling_info(); + return self.m_request->get_profiling_info(); }, py::call_guard(), R"( @@ -602,7 +606,7 @@ void regclass_InferRequest(py::module m) { cls.def( "query_state", [](InferRequestWrapper& self) { - return self.m_request.query_state(); + return self.m_request->query_state(); }, py::call_guard(), R"( @@ -617,7 +621,7 @@ void regclass_InferRequest(py::module m) { cls.def( "reset_state", [](InferRequestWrapper& self) { - return self.m_request.reset_state(); + return self.m_request->reset_state(); }, R"( Resets all internal variable states for relevant infer request to @@ -627,7 +631,7 @@ void regclass_InferRequest(py::module m) { cls.def( "get_compiled_model", [](InferRequestWrapper& self) { - return self.m_request.get_compiled_model(); + return self.m_request->get_compiled_model(); }, R"( Returns the compiled model. 
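``AsyncInferQueue``, patched the same way earlier in this commit, stores one such callback per pooled request. A typical queue-based flow, under the same placeholder assumptions as the previous sketch, looks like this:

```python
import numpy as np
import openvino as ov

core = ov.Core()
compiled = core.compile_model("model.xml", "CPU")  # placeholder model path
queue = ov.AsyncInferQueue(compiled, jobs=4)

def on_done(request, userdata):
    # Called with the finished request and the userdata passed to start_async.
    print("job", userdata, "output shape:", request.get_output_tensor().shape)

queue.set_callback(on_done)

for i in range(8):
    data = np.zeros(compiled.input(0).shape, dtype=np.float32)
    queue.start_async({compiled.input(0): data}, userdata=i)

queue.wait_all()
# The queue and the callbacks it stores can now be torn down from any thread
# without risking the GIL deadlock described in the commit message.
```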
@@ -700,7 +704,7 @@ void regclass_InferRequest(py::module m) { cls.def_property_readonly( "profiling_info", [](InferRequestWrapper& self) { - return self.m_request.get_profiling_info(); + return self.m_request->get_profiling_info(); }, py::call_guard(), R"( diff --git a/src/bindings/python/src/pyopenvino/core/infer_request.hpp b/src/bindings/python/src/pyopenvino/core/infer_request.hpp index 69f0412a1745c9..719d0374af6ff3 100644 --- a/src/bindings/python/src/pyopenvino/core/infer_request.hpp +++ b/src/bindings/python/src/pyopenvino/core/infer_request.hpp @@ -32,7 +32,7 @@ class InferRequestWrapper { const std::vector>& outputs, bool set_default_callback = true, py::object userdata = py::none()) - : m_request{std::move(request)}, + : m_request{InferRequestWrapper::wrap_infer_request_to_sp(std::move(request))}, m_inputs{inputs}, m_outputs{outputs}, m_userdata{userdata} { @@ -44,7 +44,7 @@ class InferRequestWrapper { // Bump reference counter auto end_time = m_end_time; // Set standard callback which saves "end-time" for inference call - m_request.set_callback([end_time](std::exception_ptr exception_ptr) { + m_request->set_callback([end_time](std::exception_ptr exception_ptr) { *end_time = Time::now(); try { if (exception_ptr) { @@ -73,7 +73,7 @@ class InferRequestWrapper { } // Original ov::InferRequest class that is held by this wrapper - ov::InferRequest m_request; + std::shared_ptr m_request; // Inputs and Outputs inherrited from ov::CompiledModel std::vector> m_inputs; std::vector> m_outputs; @@ -91,11 +91,18 @@ class InferRequestWrapper { tensors.reserve(v.size()); for (auto&& node : v) { - tensors.push_back(m_request.get_tensor(node)); + tensors.push_back(m_request->get_tensor(node)); } return tensors; } + + static std::shared_ptr wrap_infer_request_to_sp(ov::InferRequest request) { + return std::shared_ptr(new ov::InferRequest(std::move(request)), [](ov::InferRequest* request) { + py::gil_scoped_release release; + delete request; + }); + } }; void regclass_InferRequest(py::module m); diff --git a/src/bindings/python/src/pyopenvino/frontend/extension.cpp b/src/bindings/python/src/pyopenvino/frontend/extension.cpp index a4f2e9cae1ca0c..4446ea2c9acc33 100644 --- a/src/bindings/python/src/pyopenvino/frontend/extension.cpp +++ b/src/bindings/python/src/pyopenvino/frontend/extension.cpp @@ -30,19 +30,26 @@ void regclass_frontend_TelemetryExtension(py::module m) { py::function& send_event, py::function& send_error, py::function& send_stack_trace) { + auto send_event_sp = Common::utils::wrap_pyfunction(send_event); + auto send_error_sp = Common::utils::wrap_pyfunction(send_error); + auto send_stack_trace_sp = Common::utils::wrap_pyfunction(send_stack_trace); + return std::make_shared( event_category, - [send_event](const std::string& category, const std::string& action, const std::string& label, int value) { + [send_event_sp](const std::string& category, + const std::string& action, + const std::string& label, + int value) { py::gil_scoped_acquire acquire; - send_event(category, action, label, value); + (*send_event_sp)(category, action, label, value); }, - [send_error](const std::string& category, const std::string& error_message) { + [send_error_sp](const std::string& category, const std::string& error_message) { py::gil_scoped_acquire acquire; - send_error(category, error_message); + (*send_error_sp)(category, error_message); }, - [send_stack_trace](const std::string& category, const std::string& error_message) { + [send_stack_trace_sp](const std::string& category, const std::string& 
error_message) { py::gil_scoped_acquire acquire; - send_stack_trace(category, error_message); + (*send_stack_trace_sp)(category, error_message); }); })); diff --git a/src/bindings/python/src/pyopenvino/utils/utils.cpp b/src/bindings/python/src/pyopenvino/utils/utils.cpp index 27f015b14272c2..feeac2d7a02a73 100644 --- a/src/bindings/python/src/pyopenvino/utils/utils.cpp +++ b/src/bindings/python/src/pyopenvino/utils/utils.cpp @@ -419,5 +419,12 @@ ov::Any py_object_to_any(const py::object& py_obj) { } OPENVINO_ASSERT(false, "Unsupported attribute type."); } +std::shared_ptr wrap_pyfunction(py::function f_callback) { + auto callback_sp = std::shared_ptr(new py::function(std::move(f_callback)), [](py::function* c) { + py::gil_scoped_acquire acquire; + delete c; + }); + return callback_sp; +} }; // namespace utils }; // namespace Common diff --git a/src/bindings/python/src/pyopenvino/utils/utils.hpp b/src/bindings/python/src/pyopenvino/utils/utils.hpp index 1e0e7f23069d2e..e4048b3f52feb3 100644 --- a/src/bindings/python/src/pyopenvino/utils/utils.hpp +++ b/src/bindings/python/src/pyopenvino/utils/utils.hpp @@ -58,5 +58,7 @@ namespace utils { ov::pass::Serialize::Version convert_to_version(const std::string& version); + std::shared_ptr wrap_pyfunction(py::function f_callback); + }; // namespace utils }; // namespace Common diff --git a/src/bindings/python/tests/test_graph/test_op.py b/src/bindings/python/tests/test_graph/test_op.py index 2bd609ef5278f1..5a8abdc55ea86c 100644 --- a/src/bindings/python/tests/test_graph/test_op.py +++ b/src/bindings/python/tests/test_graph/test_op.py @@ -107,9 +107,7 @@ def test_custom_add_model(): def test_custom_op(): model = create_snake_model() - # todo: CVS-141744 - # it hangs with AUTO plugin, but works well with CPU - compiled_model = compile_model(model, "CPU") + compiled_model = compile_model(model) assert isinstance(compiled_model, CompiledModel) request = compiled_model.create_infer_request() From 2893f2fc7c67a26d3e550ed09a0bb0cbe10ce329 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Fri, 7 Jun 2024 15:13:26 +0300 Subject: [PATCH 14/18] [Snippets] Added dynamism support to MHA Tokenization pass (#24897) ### Details: - *Updated `MHATokenization` pass to support dynamic nodes tokenization* - *Added MHATokenization unit tests* - *Disabled dynamic MHA tokenization in CPU Plugin* ### Tickets: - *123329* --- .../snippets/src/pass/mha_tokenization.cpp | 85 +++++--- .../tests/src/pass/mha_tokenization.cpp | 70 ++++++- .../transformation_pipeline.cpp | 2 +- .../include/subgraph_mha.hpp | 18 +- .../ov_snippets_models/src/subgraph_mha.cpp | 196 +++++++++--------- 5 files changed, 227 insertions(+), 144 deletions(-) diff --git a/src/common/snippets/src/pass/mha_tokenization.cpp b/src/common/snippets/src/pass/mha_tokenization.cpp index 08deb95b12ec22..d928cdd1d33eba 100644 --- a/src/common/snippets/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/src/pass/mha_tokenization.cpp @@ -16,7 +16,7 @@ namespace { bool is_supported_tensor(const ov::descriptor::Tensor& t) { - return t.get_partial_shape().is_static() && ov::snippets::utils::one_of(t.get_shape().size(), 3lu, 4lu); + return t.get_partial_shape().rank().is_static() && ov::snippets::utils::one_of(t.get_partial_shape().size(), 3lu, 4lu); } bool is_supported_intermediate_op(const std::shared_ptr& node) { @@ -68,6 +68,10 @@ void tokenize_broadcast(const std::shared_ptr& interm_op, ov::NodeVect // TODO: Can we reuse AppropriateForSubgraph here? 
Seems like it's huge check for Broadcast if (broadcast && broadcast->get_broadcast_spec().m_type == ov::op::AutoBroadcastType::NUMPY && broadcast->get_output_target_inputs(0).size() == 1) { + // TODO: Add support of Broadcast with ShapeOf subgraph on second input + if (!ov::is_type(broadcast->input_value(1).get_node_shared_ptr())) + continue; + broadcast_nodes.push_back(broadcast); const auto pshape = broadcast->get_input_partial_shape(0); @@ -96,10 +100,17 @@ void tokenize_broadcast(const std::shared_ptr& interm_op, ov::NodeVect bool tokenize_reshape_around_softmax(std::shared_ptr& interm_op, std::shared_ptr& reshape, ov::NodeVector& ordered_ops) { reshape = ov::as_type_ptr(interm_op); if (reshape) { - const auto in_shape = reshape->get_input_shape(0); - const auto out_shape = reshape->get_output_shape(0); - if (in_shape.back() != out_shape.back() || reshape->get_output_target_inputs(0).size() != 1) + // TODO: Add support of Reshape with ShapeOf subgraph on second input + if (!ov::is_type(reshape->input_value(1).get_node_shared_ptr())) + return false; + + const auto in_shape = reshape->get_input_partial_shape(0); + const auto out_shape = reshape->get_output_partial_shape(0); + const auto in_last_dim = *in_shape.crbegin(); + const auto out_last_dim = *out_shape.crbegin(); + if (in_last_dim.is_dynamic() || out_last_dim.is_dynamic() || in_last_dim != out_last_dim || reshape->get_output_target_inputs(0).size() != 1) return false; + ordered_ops.push_back(reshape); interm_op = reshape->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); } @@ -204,8 +215,7 @@ bool ov::snippets::pass::TokenizeMHASnippets::is_matmul0_supported(const std::sh ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsTokenization::Config& config) { MATCHER_SCOPE(TokenizeMHASnippets); - auto m_matmul0 = std::make_shared(ov::pass::pattern::any_input(ov::pass::pattern::has_static_shape()), - ov::pass::pattern::any_input(ov::pass::pattern::has_static_shape())); + auto m_matmul0 = std::make_shared(ov::pass::pattern::any_input(), ov::pass::pattern::any_input()); register_matcher(std::make_shared(m_matmul0, matcher_name), [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher &m) { @@ -224,20 +234,14 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // Example: // Buffer - i32 [32, 128] -> ~ Loop ~ -> Buffer - i8 [32, 128] // After each Loop iteration we should increment pointers of Buffers: accordingly on 4 byte and 1 byte for scalar case. - // It means that these Buffers cannot be inplace => Each Buffer should have the own register + // It means that these increments are not proportional => Each Buffer should have the own register // For that we can just check the following "branches": // - Between MatMul0 and MatMul1 - Softmax is sync point. The operations between MatMul0 -> Softmax and Softmax -> MatMul1 // will be fused into one loop after conversion to snippet dialect (Because it's just FQ, Eltwise nodes) - // - Between MatMul0 and Transpose1 - At the moment operations after Transpose1 cannot be fused in Transpose Loop (to avoid performance regressions). + // - Between MatMul0 and Transpose1 - At the moment operations after Transpose1 cannot be fused in inner Transpose Loop + // (to avoid performance regressions due to scalar calculations). 
// But operations after Transpose1 and before MatMul0 will be fused into one loop as well (look at first point) - // Note: If the pass is updated, need to check the new possible branches for potential non-inplace Buffers! - // Default value is 2 because - // - Firstly, Softmax always needs Buffers - // - Secondly, Softmax needs 2 Buffers but they can be inplace - One virtual port is enough for Softmax => buffer_count = 1 - // - Thirdly, MatMul requires unique Buffers on inputs and outputs because blocking implementation increments input/output pointers during computations - // However, all of the Buffers are usually reused by the next MatMul and Softmax. - // So on sufficiently large subgraphs we use only one additional unique buffer => buffer_count increments by 1 - size_t buffer_count = 2; + size_t uniqie_buffer_reg_group_count = 1; // After MatMul0 there is always one Buffer std::string fused_names; ov::NodeVector ordered_ops; @@ -260,24 +264,20 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken if (!is_matmul0_supported(matmul0)) return false; - const auto matmul0_prc = op::Brgemm::get_output_type(matmul0->get_input_element_type(0), matmul0->get_input_element_type(1)); - // Between MatMul0 and Softmax will be the one Loop because of LoopFusing optimization. - // The Loop will have one Buffer with the same shape both on input and output. - // Need to check for precision to get if we need one more register for Buffer - if (matmul0_prc.size() != ov::element::f32.size()) { - if (buffer_count < 2) - buffer_count++; - } - ordered_ops.push_back(matmul0); const auto pattern_rank = matmul0->get_output_partial_shape(0).size(); + const auto ops_count_before_softmax = ordered_ops.size(); auto interm_op = matmul0->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); // Add supported operations which are between MatMul0 and Softmax to ordered_ops if (!update_intermediate_supported_ops(interm_op, ordered_ops, hidden_virtual_ports_count, potential_body_params_count)) return false; + // If before Softmax there is Eltwise ops, there will be one more Buffer + if (ops_count_before_softmax != ordered_ops.size() && interm_op->get_output_partial_shape(0).rbegin()->is_dynamic()) + uniqie_buffer_reg_group_count++; + std::shared_ptr reshape0 = nullptr; if (!tokenize_reshape_around_softmax(interm_op, reshape0, ordered_ops)) return false; @@ -294,6 +294,11 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken if (axis != rank.get_length() - 1 || interm_op->get_output_target_inputs(0).size() != 1) return false; + + // Softmax need one buffer at least + if (interm_op->get_output_partial_shape(0).rbegin()->is_dynamic()) + uniqie_buffer_reg_group_count++; + ordered_ops.push_back(interm_op); interm_op = interm_op->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); @@ -302,7 +307,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken return false; if (((reshape0 == nullptr) != (reshape1 == nullptr)) || - (reshape0 && reshape1 && (reshape0->get_input_shape(0) != reshape1->get_output_shape(0)))) + (reshape0 && reshape1 && (reshape0->get_input_partial_shape(0) != reshape1->get_output_partial_shape(0)))) return false; // Add supported operations which are between Softmax and MatMul1 to ordered_ops @@ -310,8 +315,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken return false; const auto matmul1 = ov::as_type_ptr(interm_op); - if (!matmul1 || 
matmul1->get_output_target_inputs(0).size() != 1 || - matmul1->get_transpose_a() || matmul1->get_transpose_b()) + if (!matmul1 || matmul1->get_transpose_a() || matmul1->get_transpose_b()) return false; const auto matmul1_out_type = op::Brgemm::get_output_type(matmul1->get_input_element_type(0), @@ -328,8 +332,9 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // Between Softmax and MatMul1 will be the one Loop because of LoopFusing optimization. // The Loop will have one Buffer with the same shape both on input and output. // Need to check for precision to get if we need one more register for Buffer - if (matmul1->get_input_element_type(0).size() != ov::element::f32.size()) { - buffer_count++; + const auto matmul0_prc = op::Brgemm::get_output_type(matmul0->get_input_element_type(0), matmul0->get_input_element_type(1)); + if (matmul1->get_input_element_type(0).size() != matmul0_prc.size() || matmul1->get_input_partial_shape(0).is_dynamic()) { + uniqie_buffer_reg_group_count++; } /***********************/ @@ -358,6 +363,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // There is transformation ExplicitTransposeMatMulInputs that set supported order and transposed_b(false). // We can allow to call this pass only if ops have scalar shapes to avoid shape mismatching const auto is_transposed_b_0 = matmul0->get_transpose_b(); + bool has_matmul0_has_ops_on_input = false; while (is_supported_intermediate_op(parent)) { // All supported ops have only one output port if (parent->get_output_target_inputs(0).size() != 1) @@ -379,6 +385,11 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken ordered_ops.insert(ordered_ops.begin(), parent); // [107731] To go always through 0-th port - is it safe? 
parent = parent->get_input_node_shared_ptr(0); + has_matmul0_has_ops_on_input = true; + } + // If there are ops on second input of MatMul0 -> there always will be unique Buffer + if (has_matmul0_has_ops_on_input) { + uniqie_buffer_reg_group_count++; } auto tokenize_transpose = [&](const std::shared_ptr& transpose, @@ -412,7 +423,9 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken bool are_ops_after_matmul1 = false; auto child = matmul1->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); - while (is_supported_intermediate_op(child)) { + const auto can_be_ops_after_matmul1_tokenized = matmul1->get_output_target_inputs(0).size() == 1; + bool has_matmul1_has_ops_on_output = false; + while (can_be_ops_after_matmul1_tokenized && is_supported_intermediate_op(child)) { are_ops_after_matmul1 = true; // All supported ops have only one output port if (child->get_output_target_inputs(0).size() != 1) @@ -427,19 +440,23 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // TODO [75567]: move this plugin-specific constraint to the plugin callback // We cannot collapse op to Subgraph if count of potential Parameter and Result count is higher 12 - if (potential_body_params_count + child->get_output_target_inputs(0).size() + hidden_virtual_ports_count + buffer_count > 12) { + if (potential_body_params_count + child->get_output_target_inputs(0).size() + hidden_virtual_ports_count + uniqie_buffer_reg_group_count > 12) { break; } ordered_ops.push_back(child); child = child->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + has_matmul1_has_ops_on_output = true; + } + if (has_matmul1_has_ops_on_output) { + uniqie_buffer_reg_group_count++; } // At the moment Snippets don't support nodes between MatMul1 and Transpose3 due to Loop and strided calculations limitations // MatMul1 // // Transpose3 - if (!are_ops_after_matmul1) { + if (can_be_ops_after_matmul1_tokenized && !are_ops_after_matmul1) { auto transpose3 = config.get_mha_token_enable_transpose_on_output() ? 
ov::as_type_ptr(child) : nullptr; if (is_valid_transpose(transpose3, config.get_mha_supported_transpose_ranks(), get_fusion_transpose_order(pattern_rank)) && transpose3->get_input_element_type(0) == matmul1_out_type) { // To avoid Convert between MatMul1 and Transpose3 @@ -455,7 +472,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // TODO [75567]: move this plugin-specific constraint to the plugin callback const auto last_node = ordered_ops.back(); - if (potential_body_params_count + last_node->get_output_size() + hidden_virtual_ports_count + buffer_count > 11) { + if (potential_body_params_count + last_node->get_output_size() + hidden_virtual_ports_count + uniqie_buffer_reg_group_count > 11) { return false; } diff --git a/src/common/snippets/tests/src/pass/mha_tokenization.cpp b/src/common/snippets/tests/src/pass/mha_tokenization.cpp index 6438dff516cded..b411aace066203 100644 --- a/src/common/snippets/tests/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/tests/src/pass/mha_tokenization.cpp @@ -39,6 +39,30 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_4D) { run(); } +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_4D_Dynamic) { + const auto &f = MHAFunction(std::vector{{-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), true, false); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_4D_Dynamic_M) { + const auto &f = MHAFunction(std::vector{{1, -1, 12, 64}, {1, 128, 12, 64}, {1, 12, -1, 128}, {1, 128, 12, 64}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), true, false); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_4D_Dynamic_K) { + const auto &f = MHAFunction(std::vector{{1, 128, 12, -1}, {1, 128, 12, -1}, {1, 12, 128, 128}, {1, 128, 12, 64}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), true, false); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_3D) { const auto &f = MHAFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32})); @@ -47,8 +71,15 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_3D) { run(); } -TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-114607 */, smoke_Snippets_MHA_with_MatMul0_Transpose) { - GTEST_SKIP(); +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_3D_Dynamic) { + const auto &f = MHAFunction(std::vector{{-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), true, false); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_with_MatMul0_Transpose) { const auto &f = MHAMatMul0TransposeFunction(std::vector{{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32})); model = f.getOriginal(); @@ -56,6 +87,16 @@ TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-114607 */, smoke_Snippets_MHA_with_M run(); } +TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-142098 */, smoke_Snippets_MHA_with_MatMul0_Transpose_Dynamic) { + 
GTEST_SKIP(); + const auto &f = MHAMatMul0TransposeFunction(std::vector{{-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), + false); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-114607 */, smoke_Snippets_MHA_with_int_Matmuls) { GTEST_SKIP(); const auto &f = MHAINT8MatMulTypeRelaxedFunction(std::vector{{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}); @@ -71,6 +112,14 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_extraction) { run(); } +TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-142098 */, smoke_Snippets_MHA_Dynamic_Transpose_extraction) { + GTEST_SKIP(); + const auto& f = MHATransposedInputFunction(std::vector{{-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}}, true); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_extraction_and_unsupported_existing_transpose) { const auto& f = MHATransposedInputFunction(std::vector{{1, 128, 12, 64}, {1, 12, 64, 128}, {1, 128, 12, 64}}, true, std::vector{0, 3, 1, 2}); @@ -79,6 +128,15 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_extraction_and_uns run(); } +TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-142098 */, smoke_Snippets_MHA_Dynamic_Transpose_extraction_and_unsupported_existing_transpose) { + GTEST_SKIP(); + const auto& f = MHATransposedInputFunction(std::vector{{-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}}, true, + std::vector{0, 3, 1, 2}); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_fusion) { const auto& f = MHATransposedInputFunction(std::vector{{1, 128, 12, 64}, {1, 64, 128, 12}, {1, 128, 12, 64}}, false, std::vector{0, 2, 1, 3}); @@ -87,6 +145,14 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_fusion) { run(); } +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Dyanmic_Transpose_fusion) { + const auto& f = MHATransposedInputFunction(std::vector{{-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}}, false, + std::vector{0, 2, 1, 3}); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM) { const auto& f = MHASplitMFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 3d13cab76dbb23..006935a85e85de 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -867,7 +867,7 @@ void Transformations::MainSnippets(void) { #if defined(OPENVINO_ARCH_X86_64) auto is_supported_matmul = [this](const std::shared_ptr& n) { const auto matmul = ov::as_type_ptr(n); - if (!matmul) + if (!matmul || matmul->is_dynamic()) return false; const auto in_type0 = matmul->get_input_element_type(0); const auto in_type1 = matmul->get_input_element_type(1); diff --git a/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp b/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp index 3e3cb995b70555..9ec265c9322f5c 100644 --- 
a/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp +++ b/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp @@ -42,8 +42,9 @@ namespace snippets { */ class MHAFunction : public SnippetsFunctionBase { public: - explicit MHAFunction(const std::vector& inputShapes, const std::vector& precisions, bool with_mul = true) - : SnippetsFunctionBase(inputShapes), with_mul(with_mul), precisions(precisions) { + explicit MHAFunction(const std::vector& inputShapes, const std::vector& precisions, + bool with_mul = true, bool with_reshape = true) + : SnippetsFunctionBase(inputShapes), with_mul(with_mul), with_reshape(with_reshape), precisions(precisions) { OPENVINO_ASSERT(input_shapes.size() == 4, "Got invalid number of input shapes"); OPENVINO_ASSERT(precisions.size() == 4, "Got invalid number of input precisions"); } @@ -51,8 +52,9 @@ class MHAFunction : public SnippetsFunctionBase { std::shared_ptr initOriginal() const override; std::shared_ptr initReference() const override; - bool with_mul = true; - std::vector precisions; + const bool with_mul = true; + const bool with_reshape = true; + const std::vector precisions; }; class MHASplitMFunction : public MHAFunction { @@ -85,8 +87,9 @@ class MHASplitMFunction : public MHAFunction { */ class MHAMatMul0TransposeFunction : public SnippetsFunctionBase { public: - explicit MHAMatMul0TransposeFunction(const std::vector& inputShapes, const std::vector& precisions) - : SnippetsFunctionBase(inputShapes), precisions(precisions) { + explicit MHAMatMul0TransposeFunction(const std::vector& inputShapes, const std::vector& precisions, + bool with_reshape = true) + : SnippetsFunctionBase(inputShapes), with_reshape(with_reshape), precisions(precisions) { OPENVINO_ASSERT(input_shapes.size() == 4, "Got invalid number of input shapes"); OPENVINO_ASSERT(precisions.size() == 4, "Got invalid number of input precisions"); } @@ -94,7 +97,8 @@ class MHAMatMul0TransposeFunction : public SnippetsFunctionBase { std::shared_ptr initOriginal() const override; std::shared_ptr initReference() const override; - std::vector precisions; + const bool with_reshape = true; + const std::vector precisions; }; /* Graph: diff --git a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp index 3157c53fbb32de..f923a9a3aa168e 100644 --- a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp +++ b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp @@ -70,26 +70,35 @@ std::shared_ptr MHAFunction::initOriginal() const { std::shared_ptr matmul_parent1 = transpose1; if (with_mul) { ov::Shape shape(rank, 1); - shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3]; - std::vector mulConstData(ov::shape_size(shape)); + if (transpose1->get_output_partial_shape(0).is_static()) { + shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3]; + } const auto mulConst = ov::test::utils::make_constant(precisions[1], shape); matmul_parent1 = std::make_shared(transpose1, mulConst); } const auto matMul0 = std::make_shared(transpose0, matmul_parent1); const auto add = std::make_shared(matMul0, addParam); - const auto interm_shape = add->get_output_shape(0); - const auto batch = std::accumulate(interm_shape.cbegin(), interm_shape.cbegin() + rank - 1, 1, std::multiplies()); - const auto reshape0ConstData = std::vector{ batch, -1 }; - const auto reshape1ConstData = interm_shape; - const auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reshape0ConstData.size()}, 
reshape0ConstData); - const auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reshape1ConstData.size()}, reshape1ConstData); + auto softmax_out = add->output(0); + if (with_reshape) { + const auto interm_shape = add->get_output_shape(0); + const auto batch = std::accumulate(interm_shape.cbegin(), interm_shape.cbegin() + rank - 1, 1, std::multiplies()); + const auto reshape0ConstData = std::vector{ batch, -1 }; + const auto reshape1ConstData = interm_shape; + const auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reshape0ConstData.size()}, reshape0ConstData); + const auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reshape1ConstData.size()}, reshape1ConstData); + + const auto reshape0 = std::make_shared(add, reshape0Const, true); + const auto softMax = std::make_shared(reshape0, 1); + const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + softmax_out = reshape1->output(0); + } else { + const auto softMax = std::make_shared(add, rank - 1); + softmax_out = softMax->output(0); + } - const auto reshape0 = std::make_shared(add, reshape0Const, true); - const auto softMax = std::make_shared(reshape0, 1); - const auto reshape1 = std::make_shared(softMax, reshape1Const, true); const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); - const auto matMul1 = std::make_shared(reshape1, transpose2); + const auto matMul1 = std::make_shared(softmax_out, transpose2); const auto transpose3 = std::make_shared(matMul1, transpose3Const); ov::ResultVector results{std::make_shared(transpose3)}; @@ -124,13 +133,19 @@ std::shared_ptr MHAFunction::initReference() const { std::shared_ptr matmul_parent1 = transpose1; if (with_mul) { ov::Shape shape(rank, 1); - shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3]; - std::vector mulConstData(ov::shape_size(shape)); + if (transpose1->get_output_partial_shape(0).is_static()) { + shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3]; + } const auto mulConst = ov::test::utils::make_constant(precisions[1], shape); - const auto mulParam = std::make_shared(precisions[1], mulConst->get_shape()); - matmul_parent1 = std::make_shared(transpose1, mulParam); - subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param}; - subgraph_inputs = {data0, data1, mulConst, data2, data3}; + + if (ov::shape_size(shape) > 1) { + const auto mulParam = std::make_shared(precisions[1], mulConst->get_shape()); + matmul_parent1 = std::make_shared(transpose1, mulParam); + subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param}; + subgraph_inputs = {data0, data1, mulConst, data2, data3}; + } else { + matmul_parent1 = std::make_shared(transpose1, mulConst); + } } const auto matMul0 = std::make_shared(transpose0, matmul_parent1); @@ -182,16 +197,22 @@ std::shared_ptr MHASplitMFunction::initReference() const { std::shared_ptr matmul_parent1 = transpose1; if (with_mul) { ov::Shape shape(rank - 1, 1); - shape[rank - 4] = transpose1->get_output_shape(0)[rank - 4]; - ov::Shape reshape_shape = shape; - reshape_shape.insert(reshape_shape.cbegin() + rank - 3, 1); - std::vector mulConstData(ov::shape_size(shape)); + if (transpose1->get_output_partial_shape(0).is_static()) { + shape[rank - 4] = transpose1->get_output_shape(0)[rank - 4]; + } const auto mulConst = ov::test::utils::make_constant(precisions[1], shape); - const auto reshape_mul = make_reshape(mulConst, reshape_shape); - const auto mulParam = 
std::make_shared(precisions[1], reshape_mul->get_shape()); - matmul_parent1 = std::make_shared(transpose1, mulParam); - subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param}; - subgraph_inputs = {reshape0, reshape1, reshape_mul, reshape2, reshape3}; + + if (ov::shape_size(shape) > 1) { + ov::Shape reshape_shape = shape; + reshape_shape.insert(reshape_shape.cbegin() + rank - 3, 1); + const auto mulReshape = make_reshape(mulConst, reshape_shape); + const auto mulParam = std::make_shared(precisions[1], mulReshape->get_shape()); + matmul_parent1 = std::make_shared(transpose1, mulParam); + subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param}; + subgraph_inputs = {reshape0, reshape1, mulReshape, reshape2, reshape3}; + } else { + matmul_parent1 = std::make_shared(transpose1, mulConst); + } } const auto matMul0 = std::make_shared(transpose0, matmul_parent1); @@ -217,47 +238,42 @@ std::shared_ptr MHAMatMul0TransposeFunction::initOriginal() const { auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]); ov::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, transpose2Param}; - std::vector constantShapes; - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1})); - constantShapes.push_back(ov::Shape({2})); - constantShapes.push_back(ov::Shape({4})); - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - - const auto order = std::vector{0, 2, 1, 3}; - auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[0], order); - auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[1], order); - auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[5], order); - auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[6], order); - - std::vector mulConstData(1); - auto mulConst = ov::test::utils::make_constant(precisions[1], ov::Shape{1}); - - std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * - input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), - -1}; - auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[3], reshape0ConstData); + const auto rank = input_shapes[0].size(); + const auto fusion_order = get_fusion_order(rank); - std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), - static_cast(input_shapes[0].get_shape()[2]), - static_cast(input_shapes[0].get_shape()[1]), - static_cast(input_shapes[0].get_shape()[1])}; - auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[4], reshape1ConstData); + const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); - float transA = false; - float transB = false; const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); const auto transpose1 = 
std::make_shared(transpose1Param, transpose1Const); + + const auto mulConst = ov::test::utils::make_constant(precisions[1], ov::Shape{1}); const auto mul = std::make_shared(transpose1, mulConst); - const auto matMul0 = std::make_shared(transpose0, mul, transA, true); + const auto matMul0 = std::make_shared(transpose0, mul, false, true); const auto add = std::make_shared(matMul0, addParam); - const auto reshape0 = std::make_shared(add, reshape0Const, true); - const auto softMax = std::make_shared(reshape0, 1); - const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + + auto softmax_out = add->output(0); + if (with_reshape) { + const auto interm_shape = add->get_output_shape(0); + const auto batch = std::accumulate(interm_shape.cbegin(), interm_shape.cbegin() + rank - 1, 1, std::multiplies()); + const auto reshape0ConstData = std::vector{ batch, -1 }; + const auto reshape1ConstData = interm_shape; + const auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reshape0ConstData.size()}, reshape0ConstData); + const auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{reshape1ConstData.size()}, reshape1ConstData); + + const auto reshape0 = std::make_shared(add, reshape0Const, true); + const auto softMax = std::make_shared(reshape0, 1); + const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + softmax_out = reshape1->output(0); + } else { + const auto softMax = std::make_shared(add, rank - 1); + softmax_out = softMax->output(0); + } + const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); - const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB); + const auto matMul1 = std::make_shared(softmax_out, transpose2); const auto transpose3 = std::make_shared(matMul1, transpose3Const); ov::ResultVector results{std::make_shared(transpose3)}; @@ -269,58 +285,38 @@ std::shared_ptr MHAMatMul0TransposeFunction::initReference() const { auto data2 = std::make_shared(precisions[2], input_shapes[2]); auto data3 = std::make_shared(precisions[3], input_shapes[3]); ov::ParameterVector ngraphParams = {data0, data1, data2, data3}; + NodeVector subgraph_inputs = {data0, data1, data2, data3}; auto transpose0Param = std::make_shared(precisions[0], input_shapes[0]); auto transpose1Param = std::make_shared(precisions[1], input_shapes[1]); auto addParam = std::make_shared(precisions[2], input_shapes[2]); auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]); - std::vector constantShapes; - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1})); - constantShapes.push_back(ov::Shape({2})); - constantShapes.push_back(ov::Shape({4})); - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); - - auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[0], std::vector{0, 2, 1, 3}); - auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[1], std::vector{0, 2, 3, 1}); - auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[5], std::vector{0, 2, 1, 3}); - auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[6], std::vector{0, 2, 1, 3}); - - std::vector mulConstData(1); - auto mulConst = 
ov::test::utils::make_constant(precisions[1], ov::Shape{1}); - ov::ParameterVector subgraphParams = {transpose0Param, transpose1Param, addParam, transpose2Param}; + ov::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param}; - std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * - input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), - -1}; - auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[3], reshape0ConstData); + const auto rank = input_shapes[0].size(); + const auto fusion_order = get_fusion_order(rank); + const auto decomposed_order = get_decomposed_order(rank); - std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), - static_cast(input_shapes[0].get_shape()[2]), - static_cast(input_shapes[0].get_shape()[1]), - static_cast(input_shapes[0].get_shape()[1])}; - auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, constantShapes[4], reshape1ConstData); + const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, decomposed_order); + const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); - float transA = false; - float transB = false; const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); + + const auto mulConst = ov::test::utils::make_constant(precisions[1], ov::Shape{1}); const auto mul = std::make_shared(transpose1, mulConst); - const auto matMul0 = std::make_shared(transpose0, mul, transA, transB); + const auto matMul0 = std::make_shared(transpose0, mul); const auto add = std::make_shared(matMul0, addParam); - const auto reshape0 = std::make_shared(add, reshape0Const, true); - const auto softMax = std::make_shared(reshape0, 1); - const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + const auto softMax = std::make_shared(add, rank - 1); const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); - const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB); + const auto matMul1 = std::make_shared(softMax, transpose2); const auto transpose3 = std::make_shared(matMul1, transpose3Const); - auto subgraph = std::make_shared( - NodeVector{data0, data1, data2, data3}, - std::make_shared(NodeVector{transpose3}, subgraphParams)); + auto subgraph = std::make_shared(subgraph_inputs, + std::make_shared(NodeVector{transpose3}, subgraph_params)); return std::make_shared(NodeVector{subgraph}, ngraphParams); } @@ -982,9 +978,9 @@ std::shared_ptr MHATransposedInputFunction::initReference() const { } } - const auto param0 = std::make_shared(precision, data0->get_shape()); - const auto param1 = std::make_shared(precision, in1->get_shape()); - const auto param2 = std::make_shared(precision, data2->get_shape()); + const auto param0 = std::make_shared(precision, data0->get_output_partial_shape(0)); + const auto param1 = std::make_shared(precision, in1->get_output_partial_shape(0)); + const auto param2 = std::make_shared(precision, data2->get_output_partial_shape(0)); std::shared_ptr matmul0_in1 = param1; if (!m_order.empty() && is_supported) { From d996280d5eddba942578129b28638e619d778a7f Mon Sep 17 00:00:00 2001 
From: Gorokhov Dmitriy Date: Fri, 7 Jun 2024 16:53:38 +0400 Subject: [PATCH 15/18] [CPU] Cherry-pick IP optimizations from latest oneDNN master (#24654) ### Details: - Downstream latest InnerProduct performance optimizations to oneDNN fork - Ticket: [CVS-142995](https://jira.devtools.intel.com/browse/CVS-142995) - oneDNN fork PR: https://github.com/openvinotoolkit/oneDNN/pull/252 --- src/plugins/intel_cpu/thirdparty/onednn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn index b0cd612cd3a378..a320d02d6e733c 160000 --- a/src/plugins/intel_cpu/thirdparty/onednn +++ b/src/plugins/intel_cpu/thirdparty/onednn @@ -1 +1 @@ -Subproject commit b0cd612cd3a378fb2dd73a84efddfca1df2a22db +Subproject commit a320d02d6e733c775724901675cbc8944391459d From beb37eb4e14006c96e26fe85b5c5763863660486 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Fri, 7 Jun 2024 17:04:12 +0400 Subject: [PATCH 16/18] [GPU] Move scaling from QK dot product to Q (#24896) ### Details: - Move scaling from QK dot product to Q input for better SDPA accuracy with high-value ranges ### Tickets: - [CVS-143256](https://jira.devtools.intel.com/browse/CVS-143256) --- .../intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index 635aa4d796d3db..b1aaded5ad7780 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -202,8 +202,7 @@ KERNEL(sdpa_opt)( #define QUERY_BLOCK_SIZE 1 INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, QUERY_BLOCK_SIZE, query_input, query_offset); - - query_local[query_local_offset] = val; + query_local[query_local_offset] = val * scale_val; query_local_offset += QUERY_STEP_LOCAL; query_offset += query_pitch; } @@ -338,7 +337,6 @@ KERNEL(sdpa_opt)( for (uint seq_len = sgid * SUBGROUP_SIZE + sglid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE)) { // Read value from SLM and apply scale qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len]; - qk_val[seq_idx] *= scale_val; // Apply attention mask #if IS_CAUSAL From 45147aefa02ab3246f158ee08be32ae2f27aa04a Mon Sep 17 00:00:00 2001 From: Alina Kladieva Date: Fri, 7 Jun 2024 17:22:07 +0200 Subject: [PATCH 17/18] [GHA] dGPU tests pipeline (#24572) Co-authored-by: Ilya Lavrenov --- .github/workflows/job_gpu_tests.yml | 134 ++++++++++++++++++++++++++++ .github/workflows/linux.yml | 129 ++++++-------------------- 2 files changed, 161 insertions(+), 102 deletions(-) create mode 100644 .github/workflows/job_gpu_tests.yml diff --git a/.github/workflows/job_gpu_tests.yml b/.github/workflows/job_gpu_tests.yml new file mode 100644 index 00000000000000..7ba71afec09748 --- /dev/null +++ b/.github/workflows/job_gpu_tests.yml @@ -0,0 +1,134 @@ +name: GPU + +on: + workflow_call: + inputs: + test_type: + description: 'Type of tests to execute' + type: string + required: true + device: + description: 'Device name (igpu or dgpu)' + type: string + required: true + runner: + description: 'Runner labels by which the runner will be chosen. 
Example: [ "self-hosted", "igpu" ]' + type: string + required: true + container: + description: 'JSON to be converted to the value of the "container" configuration for the job' + type: string + required: false + default: '{"image": null}' + +jobs: + GPU: + timeout-minutes: 80 + runs-on: ${{ fromJSON(inputs.runner) }} + container: ${{ fromJSON(inputs.container) }} + defaults: + run: + shell: bash + env: + DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input + INSTALL_DIR: ${{ github.workspace }}/install + INSTALL_TEST_DIR: ${{ github.workspace }}/install/tests + GTEST_PARALLEL_SCRIPT: ${{ github.workspace }}/gtest_parallel.py + steps: + - name: Download OpenVINO package + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + name: 'openvino_package' + path: ${{ env.INSTALL_DIR }} + + - name: Download OpenVINO tests package + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + name: 'openvino_tests' + path: ${{ env.INSTALL_TEST_DIR }} + + # Needed as ${{ github.workspace }} is not working correctly when using Docker + - name: Setup Variables + run: | + echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" + echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" + echo "GTEST_PARALLEL_SCRIPT=$GITHUB_WORKSPACE/gtest_parallel.py" >> "$GITHUB_ENV" + + - name: Extract OpenVINO packages + run: | + pushd $INSTALL_DIR + tar -xzf openvino_package.tar.gz -C $INSTALL_DIR + popd + pushd $INSTALL_TEST_DIR + tar -xzf openvino_tests.tar.gz -C $INSTALL_DIR + popd + + - name: Install dependencies (Linux) + run: | + $INSTALL_DIR/install_dependencies/install_openvino_dependencies.sh -c=core -c=dev -c=gpu -y + + apt-get update && apt-get install -y wget software-properties-common ca-certificates gpg-agent tzdata clinfo + env: + DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input + TZ: "Europe/London" # to prevent tzdata from waiting user input + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Get gtest-parallel script + run: wget https://raw.githubusercontent.com/google/gtest-parallel/master/gtest_parallel.py + + - name: Install compute runtime drivers + run: | + wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15985.7/intel-igc-core_1.0.15985.7_amd64.deb + wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15985.7/intel-igc-opencl_1.0.15985.7_amd64.deb + wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-level-zero-gpu-dbgsym_1.3.28454.6_amd64.ddeb + wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-level-zero-gpu_1.3.28454.6_amd64.deb + wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-opencl-icd-dbgsym_24.05.28454.6_amd64.ddeb + wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-opencl-icd_24.05.28454.6_amd64.deb + wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/libigdgmm12_22.3.11_amd64.deb + dpkg -i *.deb + + - name: Install media & display runtimes + if: ${{ inputs.device == 'dgpu' }} + run: | + apt-get update && apt-get install -y \ + libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ + libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm11 libxatracker2 
mesa-va-drivers \ + mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all + + - name: Verify devices + run: clinfo + + # + # Tests + # + + - name: OpenVINO GPU ${{ inputs.test_type }} Tests + id: run_tests + run: | + source ${INSTALL_DIR}/setupvars.sh + + TEST_RESULTS_DIR="${{ inputs.device }}_${{ inputs.test_type }}_tests" + echo "test_results_dir=$TEST_RESULTS_DIR" >> $GITHUB_OUTPUT + + rm -rf ${INSTALL_TEST_DIR}/${TEST_RESULTS_DIR} && mkdir -p ${INSTALL_TEST_DIR}/${TEST_RESULTS_DIR} + + test_filter='' + if [[ "${{ inputs.test_type }}" == "unit" ]]; then + # Ticket: 138018 + test_filter='-*scatter_nd_update_gpu.dynamic_padded_output*:*border_gpu.basic_zero_input*:*bicubic_zeros_no_align_data1x1*:*bicubic_border_align_batches*:*bilinear_zeros_no_align_data1x1*:*non_zero_gpu.empty_input*:*mark_shape_of_subgraphs.concat_with_empty_tensor_inputs*:*concat_cpu_impl.dynamic_4d_f*:*border_gpu.basic_zero_input_dynamic*:*network_test.model_with_empty_input_is_not_dynamic*:*bicubic_zeros_align_data1x1*' + else + test_filter='*smoke*' + fi + python3 ${GTEST_PARALLEL_SCRIPT} ${INSTALL_TEST_DIR}/ov_gpu_${{ inputs.test_type }}_tests --dump_json_test_results=${INSTALL_TEST_DIR}/${TEST_RESULTS_DIR}/ov_gpu_${{ inputs.test_type }}_tests.json -- --report_unique_name --gtest_filter=$test_filter + + + - name: Upload Test Results + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 + if: always() + with: + name: test-results-${{ inputs.test_type }}-${{ inputs.device }} + path: ${{ env.INSTALL_TEST_DIR }}/${{ steps.run_tests.outputs.test_results_dir }} + if-no-files-found: 'error' diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 462e30e44103b8..744e693b1cff51 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -652,119 +652,44 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} if: fromJSON(needs.smart_ci.outputs.affected_components).TOKENIZERS - GPU: - name: GPU Tests + iGPU: + name: iGPU Tests needs: [ Build, Smart_CI ] - if: fromJSON(needs.smart_ci.outputs.affected_components).GPU - timeout-minutes: 80 - runs-on: [ self-hosted, gpu ] + uses: ./.github/workflows/job_gpu_tests.yml strategy: max-parallel: 2 fail-fast: false matrix: TEST_TYPE: ['unit', 'func'] - container: - image: ubuntu:20.04 - options: --device /dev/dri:/dev/dri --group-add 109 --group-add 44 - volumes: - - /dev/dri:/dev/dri - defaults: - run: - shell: bash - env: - DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input - INSTALL_DIR: ${{ github.workspace }}/install - INSTALL_TEST_DIR: ${{ github.workspace }}/install/tests - GTEST_PARALLEL_SCRIPT: ${{ github.workspace }}/gtest_parallel.py - steps: - - name: Download OpenVINO package - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 - with: - name: 'openvino_package' - path: ${{ env.INSTALL_DIR }} - - - name: Download OpenVINO tests package - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 - with: - name: 'openvino_tests' - path: ${{ env.INSTALL_TEST_DIR }} - - # Needed as ${{ github.workspace }} is not working correctly when using Docker - - name: Setup Variables - run: | - echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" - echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" - echo "GTEST_PARALLEL_SCRIPT=$GITHUB_WORKSPACE/gtest_parallel.py" >> "$GITHUB_ENV" - - - name: Extract OpenVINO packages - run: | - pushd $INSTALL_DIR - tar -xzf openvino_package.tar.gz -C 
$INSTALL_DIR - popd - pushd $INSTALL_TEST_DIR - tar -xzf openvino_tests.tar.gz -C $INSTALL_DIR - popd - - - name: Install dependencies (Linux) - run: | - $INSTALL_DIR/install_dependencies/install_openvino_dependencies.sh -c=core -c=dev -c=gpu -y - - apt-get update && apt-get install -y wget software-properties-common ca-certificates gpg-agent tzdata - env: - DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input - TZ: "Europe/London" # to prevent tzdata from waiting user input - - - name: Setup Python ${{ env.PYTHON_VERSION }} - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Get gtest-parallel script - run: wget https://raw.githubusercontent.com/google/gtest-parallel/master/gtest_parallel.py - - - name: Install GPU Drivers - run: | - wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15985.7/intel-igc-core_1.0.15985.7_amd64.deb - wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15985.7/intel-igc-opencl_1.0.15985.7_amd64.deb - wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-level-zero-gpu-dbgsym_1.3.28454.6_amd64.ddeb - wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-level-zero-gpu_1.3.28454.6_amd64.deb - wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-opencl-icd-dbgsym_24.05.28454.6_amd64.ddeb - wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/intel-opencl-icd_24.05.28454.6_amd64.deb - wget https://github.com/intel/compute-runtime/releases/download/24.05.28454.6/libigdgmm12_22.3.11_amd64.deb - dpkg -i *.deb - - # - # Tests - # - - - name: OpenVINO GPU ${{ matrix.TEST_TYPE }} Tests - run: | - source ${INSTALL_DIR}/setupvars.sh - - rm -rf ${INSTALL_TEST_DIR}/gpu_${{ matrix.TEST_TYPE }}_tests && mkdir -p ${INSTALL_TEST_DIR}/gpu_${{ matrix.TEST_TYPE }}_tests - - test_filter='' - if [[ "${{ matrix.TEST_TYPE }}" == "unit" ]]; then - # Ticket: 138018 - test_filter='-*scatter_nd_update_gpu.dynamic_padded_output*:*border_gpu.basic_zero_input*:*bicubic_zeros_no_align_data1x1*:*bicubic_border_align_batches*:*bilinear_zeros_no_align_data1x1*:*non_zero_gpu.empty_input*:*mark_shape_of_subgraphs.concat_with_empty_tensor_inputs*:*concat_cpu_impl.dynamic_4d_f*:*border_gpu.basic_zero_input_dynamic*:*network_test.model_with_empty_input_is_not_dynamic*:*bicubic_zeros_align_data1x1*' - else - test_filter='*smoke*' - fi - python3 ${GTEST_PARALLEL_SCRIPT} ${INSTALL_TEST_DIR}/ov_gpu_${{ matrix.TEST_TYPE }}_tests --dump_json_test_results=${INSTALL_TEST_DIR}/gpu_${{ matrix.TEST_TYPE }}_tests/ov_gpu_${{ matrix.TEST_TYPE }}_tests.json -- --report_unique_name --gtest_filter=$test_filter - + with: + device: 'igpu' + test_type: ${{ matrix.TEST_TYPE }} + runner: "[ 'self-hosted', 'igpu' ]" + container: '{"image": "ubuntu:20.04", "volumes": ["/dev/dri:/dev/dri"], "options": "--group-add 109 --group-add 44 + --device /dev/dri:/dev/dri"}' + if: fromJSON(needs.smart_ci.outputs.affected_components).GPU - - name: Upload Test Results - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 - if: always() - with: - name: test-results-${{ matrix.TEST_TYPE }}-gpu - path: ${{ env.INSTALL_TEST_DIR }}/gpu_${{ matrix.TEST_TYPE }}_tests - if-no-files-found: 'error' + dGPU: + name: dGPU Tests + needs: [ Build, Smart_CI ] + uses: ./.github/workflows/job_gpu_tests.yml + strategy: + max-parallel: 2 + 
fail-fast: false + matrix: + TEST_TYPE: ['unit', 'func'] + with: + device: 'dgpu' + test_type: ${{ matrix.TEST_TYPE }} + runner: "[ 'self-hosted', 'dgpu' ]" + container: '{"image": "ubuntu:20.04", "volumes": ["/dev/dri:/dev/dri"], "options": "--group-add 109 --group-add 44 + --device /dev/dri/card0:/dev/dri/card0 --device /dev/dri/renderD128:/dev/dri/renderD128"}' + if: ${{ github.event_name == 'schedule' }} Overall_Status: name: ci/gha_overall_status needs: [Smart_CI, Build, Debian_Packages, Samples, Conformance, ONNX_Runtime, CXX_Unit_Tests, Python_Unit_Tests, TensorFlow_Layer_Tests, - CPU_Functional_Tests, TensorFlow_Models_Tests_Precommit, PyTorch_Models_Tests, NVIDIA_Plugin, Openvino_tokenizers, GPU] + CPU_Functional_Tests, TensorFlow_Models_Tests_Precommit, PyTorch_Models_Tests, NVIDIA_Plugin, Openvino_tokenizers, iGPU] if: ${{ always() }} runs-on: ubuntu-latest steps: From f6e6f2af569433f0836125575735a9762e371f42 Mon Sep 17 00:00:00 2001 From: Nesterov Alexander Date: Fri, 7 Jun 2024 19:32:26 +0200 Subject: [PATCH 18/18] [CPU][ARM] Support deconvolution to correctly handle multiple output edges on a single output port (#24754) --- src/plugins/intel_cpu/src/nodes/deconv.cpp | 4 +- .../src/arm/deconv_multiple_output_edges.cpp | 70 +++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/deconv_multiple_output_edges.cpp diff --git a/src/plugins/intel_cpu/src/nodes/deconv.cpp b/src/plugins/intel_cpu/src/nodes/deconv.cpp index 853dfc20d11299..d3f1ae0ba691a5 100644 --- a/src/plugins/intel_cpu/src/nodes/deconv.cpp +++ b/src/plugins/intel_cpu/src/nodes/deconv.cpp @@ -495,7 +495,7 @@ void Deconvolution::getSupportedDescriptors() { creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i))); } - for (size_t i = 0; i < getChildEdges().size(); ++i) { + for (size_t i = 0; i < config.outConfs.size(); ++i) { config.outConfs[i].setMemDesc( creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(i))); } @@ -1145,7 +1145,7 @@ void Deconvolution::initSupportedPrimitiveDescriptors() { creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i))); } - for (size_t i = 0; i < getChildEdges().size(); ++i) { + for (size_t i = 0; i < config.outConfs.size(); ++i) { config.outConfs[i].setMemDesc( creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(i))); } diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/deconv_multiple_output_edges.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/deconv_multiple_output_edges.cpp new file mode 100644 index 00000000000000..b2cb4785fb5720 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/deconv_multiple_output_edges.cpp @@ -0,0 +1,70 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/node_builders/constant.hpp" +#include "common_test_utils/node_builders/eltwise.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "utils/cpu_test_utils.hpp" +#include "common_test_utils/node_builders/convolution_backprop_data.hpp" + +using namespace CPUTestUtils; + +namespace ov { +namespace test { + +// Subgraph: +/* +┌──────────────────┐ ┌──────────────────┐ +│ INPUT │ │ WEIGHTS │ +└─────────┬────────┘ └─────────┬────────┘ + │ ┌──────────────────┐ │ + 
└──────┤ DECONVOLUTION ├────┘ + └──┬───────────┬───┘ + │ │ + ┌───────────────┴──┐ ┌──┴───────────────┐ + │ MULTIPLY │ │ MULTIPLY │ + └──────────────────┘ └──────────────────┘ + +Verify deconvolution node correctly handles + multiple output edges on a single output port + */ + +class DeconvMultipleOutputEdges : virtual public SubgraphBaseStaticTest { +public: + void SetUp() override { + auto ngPrc = ov::element::f32; + const ov::Shape inShape = {2, 12, 7, 7}; + const ov::Shape weiShape = {12, 6, 3, 3}; + ov::ParameterVector inputParams{std::make_shared(ngPrc, inShape), + std::make_shared(ngPrc, weiShape)}; + + auto deconv = utils::make_convolution_backprop_data(inputParams[0], + inputParams[1], + ov::element::f32, + ov::Strides({1, 1}), + ov::CoordinateDiff({0, 0}), + ov::CoordinateDiff({0, 0}), + ov::Strides({1, 1}), + ov::op::PadType::NOTSET, + false); + deconv->get_rt_info() = CPUTestsBase::makeCPUInfo({nchw}, {nchw}, {}); + + const auto const1 = ov::test::utils::make_constant(ngPrc, std::vector{2, 6, 9, 9}); + const auto const2 = ov::test::utils::make_constant(ngPrc, std::vector{2, 6, 9, 9}); + + const auto mul1 = utils::make_eltwise(deconv->output(0), const1, utils::EltwiseTypes::MULTIPLY); + const auto mul2 = utils::make_eltwise(deconv->output(0), const2, utils::EltwiseTypes::MULTIPLY); + + NodeVector results{mul1, mul2}; + function = std::make_shared(results, inputParams, "DeconvMultipleOutputEdges"); + targetDevice = ov::test::utils::DEVICE_CPU; + } +}; + +TEST_F(DeconvMultipleOutputEdges, smoke_DeconvMultipleOutputEdges_CPU) { + run(); +} + +} // namespace test +} // namespace ov
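
The essence of the deconvolution change above is that `config.outConfs` is sized per output port, while `getChildEdges()` counts consumer edges; with two Multiply consumers attached to the single output port 0, the old loop bound iterated past what is presumably a one-element `outConfs`. A minimal standalone sketch of the port-versus-edge distinction, using hypothetical types rather than the real OpenVINO node API:

```cpp
// Illustrative sketch only: hypothetical bookkeeping, not OpenVINO's Node/Edge API.
// It shows why output descriptors must be sized and indexed per output *port*
// (outConfs.size()) rather than per consumer *edge* (getChildEdges().size()).
#include <cstddef>
#include <iostream>
#include <vector>

struct Edge { std::size_t src_port; };                     // one connection to a consumer node

int main() {
    const std::size_t output_ports = 1;                    // Deconvolution exposes one output port
    const std::vector<Edge> child_edges = {{0}, {0}};      // two Multiply users on that same port

    std::vector<int> out_confs(output_ports);              // one descriptor slot per port

    // Looping over child_edges (the old bound) would also visit index 1 and step
    // outside out_confs; looping over out_confs (the fixed bound) stays in range.
    for (std::size_t i = 0; i < out_confs.size(); ++i)
        out_confs[i] = 0;

    std::cout << "ports=" << output_ports
              << " edges=" << child_edges.size() << '\n';  // prints: ports=1 edges=2
    return 0;
}
```

The new `DeconvMultipleOutputEdges` test builds exactly this topology, two Multiply users on one deconvolution output, so the descriptor loops are now exercised with more edges than ports.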
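
For the earlier sdpa_opt.cl patch that moves the softmax scale from the QK dot product onto the Q values, the rewrite is algebraically neutral, since (s*Q)*K^T equals s*(Q*K^T); what changes is that every intermediate accumulation is smaller by the factor s, which is presumably where the accuracy benefit for high-value input ranges comes from. A small float-only sketch of the equivalence and of the intermediate-magnitude difference (illustrative numbers, not kernel code):

```cpp
// Illustrative sketch: pre-scaling Q yields the same scaled dot product while
// keeping every partial sum a factor of `scale` smaller than the unscaled QK value.
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const std::size_t head_size = 64;
    const float scale = 1.0f / std::sqrt(static_cast<float>(head_size));  // 0.125
    const std::vector<float> q(head_size, 100.0f);   // "high-value range" inputs
    const std::vector<float> k(head_size, 100.0f);

    float qk = 0.0f, scaled_qk = 0.0f;
    for (std::size_t i = 0; i < head_size; ++i) {
        qk += q[i] * k[i];                    // grows up to 640000 before scaling
        scaled_qk += (q[i] * scale) * k[i];   // never exceeds the final 80000
    }
    qk *= scale;                              // scale applied after the dot product

    std::printf("post-scaled: %f  pre-scaled: %f\n", qk, scaled_qk);  // both print 80000
    return 0;
}
```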