From 015ff79bf5443cb125edfc72a45f312d55af5759 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sat, 13 Jan 2024 09:55:06 +0400 Subject: [PATCH 01/13] Fixed API validator search (#22136) --- .../OpenVINODeveloperScriptsConfig.cmake | 2 ++ .../api_validator/api_validator.cmake | 12 ++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/cmake/developer_package/OpenVINODeveloperScriptsConfig.cmake b/cmake/developer_package/OpenVINODeveloperScriptsConfig.cmake index fc9abc64b9e4cc..82b556cad6c5a6 100644 --- a/cmake/developer_package/OpenVINODeveloperScriptsConfig.cmake +++ b/cmake/developer_package/OpenVINODeveloperScriptsConfig.cmake @@ -206,6 +206,8 @@ set(CMAKE_POLICY_DEFAULT_CMP0025 NEW) set(CMAKE_POLICY_DEFAULT_CMP0026 NEW) # CMake 3.0+ (2.8.12): MacOS "@rpath" in target's install name set(CMAKE_POLICY_DEFAULT_CMP0042 NEW) +# CMake 3.1+: Simplify variable reference and escape sequence evaluation. +set(CMAKE_POLICY_DEFAULT_CMP0053 NEW) # CMake 3.9+: `RPATH` settings on macOS do not affect `install_name`. set(CMAKE_POLICY_DEFAULT_CMP0068 NEW) # CMake 3.12+: find_package() uses _ROOT variables. diff --git a/cmake/developer_package/api_validator/api_validator.cmake b/cmake/developer_package/api_validator/api_validator.cmake index 4eeb9e1e5e0b7e..090a8f84fbcc65 100644 --- a/cmake/developer_package/api_validator/api_validator.cmake +++ b/cmake/developer_package/api_validator/api_validator.cmake @@ -3,15 +3,15 @@ # if(WIN32) - set(PROGRAMFILES_ENV "ProgramFiles(X86)") + set(PROGRAMFILES_ENV "ProgramFiles\(X86\)") # check that PROGRAMFILES_ENV is defined, because in case of cross-compilation for Windows # we don't have such variable - if(DEFINED ENV{PROGRAMFILES_ENV}) + if(DEFINED ENV{${PROGRAMFILES_ENV}}) file(TO_CMAKE_PATH $ENV{${PROGRAMFILES_ENV}} PROGRAMFILES) set(WDK_PATHS "${PROGRAMFILES}/Windows Kits/10/bin/${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION}/x64" - "${PROGRAMFILES}/Windows Kits/10/bin/x64") + "${PROGRAMFILES}/Windows Kits/10/bin/x64") message(STATUS "Trying to find apivalidator in: ") foreach(wdk_path IN LISTS WDK_PATHS) @@ -19,9 +19,9 @@ if(WIN32) endforeach() find_host_program(ONECORE_API_VALIDATOR - NAMES apivalidator - PATHS ${WDK_PATHS} - DOC "ApiValidator for OneCore compliance") + NAMES apivalidator + PATHS ${WDK_PATHS} + DOC "ApiValidator for OneCore compliance") if(ONECORE_API_VALIDATOR) message(STATUS "Found apivalidator: ${ONECORE_API_VALIDATOR}") From 0a8f1383826d949c497fe3d05fef9ad2b662fa7e Mon Sep 17 00:00:00 2001 From: Vishniakov Nikolai Date: Sat, 13 Jan 2024 09:33:49 +0100 Subject: [PATCH 02/13] [OV JS] Conditional enabling of JS API (#22139) * Disable js api building for vcpkg * Disable JS API by default * Add disable JS API conditions in features.cmake * Update cmake/features.cmake * Update src/bindings/js/CMakeLists.txt --------- Co-authored-by: Ilya Lavrenov --- cmake/features.cmake | 3 +++ src/bindings/js/CMakeLists.txt | 4 ++++ src/bindings/js/node/CMakeLists.txt | 4 ---- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/cmake/features.cmake b/cmake/features.cmake index 6e383edeeb695d..aadd1db976e3d0 100644 --- a/cmake/features.cmake +++ b/cmake/features.cmake @@ -177,6 +177,9 @@ ov_dependent_option (ENABLE_SYSTEM_SNAPPY "Enables use of system version of Snap ov_dependent_option (ENABLE_PYTHON_PACKAGING "Enables packaging of Python API in APT / YUM" OFF "ENABLE_PYTHON;UNIX" OFF) +ov_dependent_option (ENABLE_JS "Enables JS API building" ON + "NOT WIN32" OFF) + ov_option(ENABLE_OPENVINO_DEBUG "Enable output for OPENVINO_DEBUG 
statements" OFF) if(NOT BUILD_SHARED_LIBS AND ENABLE_OV_TF_FRONTEND) diff --git a/src/bindings/js/CMakeLists.txt b/src/bindings/js/CMakeLists.txt index 329a86c2fa6bee..20bf139a54a893 100644 --- a/src/bindings/js/CMakeLists.txt +++ b/src/bindings/js/CMakeLists.txt @@ -2,6 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 # +if(NOT ENABLE_JS) + return() +endif() + project(OpenVINO_JS_API) add_subdirectory(node) diff --git a/src/bindings/js/node/CMakeLists.txt b/src/bindings/js/node/CMakeLists.txt index cc8918155f16d0..fffceb56799a96 100644 --- a/src/bindings/js/node/CMakeLists.txt +++ b/src/bindings/js/node/CMakeLists.txt @@ -2,10 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 # -if(WIN32) - return() -endif() - if(CMAKE_VERSION VERSION_LESS 3.14) message(WARNING "JS API is not available with CMake version less than 3.14, skipping") return() From c9738426168abc57b8b677979070443802afd5f4 Mon Sep 17 00:00:00 2001 From: Vitaliy Urusovskij Date: Sat, 13 Jan 2024 12:54:14 +0400 Subject: [PATCH 03/13] Delete `ngraph` opsets (#22068) * Delete ngraph opset10-11 * Delete ngraph opset7 * Delete ngraph opset5 * Delete ngraph opset4 * Delete ngraph opset9 * Delete ngraph opset8 * ClangFormat * Delete ngraph opset6 * Delete NG opset3 usage * Delete ngraph opset1 --- src/core/include/ngraph/opsets/opset1.hpp | 25 ---------- src/core/include/ngraph/opsets/opset10.hpp | 25 ---------- .../include/ngraph/opsets/opset10_tbl.hpp | 12 ----- src/core/include/ngraph/opsets/opset11.hpp | 25 ---------- .../include/ngraph/opsets/opset11_tbl.hpp | 12 ----- src/core/include/ngraph/opsets/opset1_tbl.hpp | 43 ---------------- src/core/include/ngraph/opsets/opset2.hpp | 1 - src/core/include/ngraph/opsets/opset4.hpp | 25 ---------- src/core/include/ngraph/opsets/opset4_tbl.hpp | 12 ----- src/core/include/ngraph/opsets/opset5.hpp | 25 ---------- src/core/include/ngraph/opsets/opset5_tbl.hpp | 12 ----- src/core/include/ngraph/opsets/opset6.hpp | 25 ---------- src/core/include/ngraph/opsets/opset6_tbl.hpp | 12 ----- src/core/include/ngraph/opsets/opset7.hpp | 25 ---------- src/core/include/ngraph/opsets/opset7_tbl.hpp | 12 ----- src/core/include/ngraph/opsets/opset8.hpp | 25 ---------- src/core/include/ngraph/opsets/opset8_tbl.hpp | 12 ----- src/core/include/ngraph/opsets/opset9.hpp | 25 ---------- src/core/include/ngraph/opsets/opset9_tbl.hpp | 12 ----- src/frontends/onnx/frontend/src/op/gather.hpp | 8 +-- src/frontends/onnx/frontend/src/op/if.cpp | 4 +- src/frontends/onnx/frontend/src/op/lstm.cpp | 1 - .../src/op/mean_variance_normalization.cpp | 6 +-- .../onnx/frontend/src/op/qlinear_conv.cpp | 11 ++-- .../onnx/frontend/src/op/qlinear_matmul.cpp | 6 +-- .../onnx/frontend/src/op/random_uniform.cpp | 13 +++-- .../frontend/src/op/random_uniform_like.cpp | 13 +++-- .../onnx/frontend/src/op/roi_align.cpp | 50 +++++++++---------- .../onnx/frontend/src/op/softsign.cpp | 3 +- .../src/concat_reorder_inplace.cpp | 2 - .../single_layer_tests/depth_to_space.cpp | 1 - .../single_layer_tests/gather_elements.cpp | 1 - .../behavior/plugin/hetero_query_network.hpp | 1 - .../depth_to_space_transformation.cpp | 1 - .../shared_test_classes/single_layer/grn.hpp | 2 - .../single_layer/memory.hpp | 3 -- .../single_layer/prior_box.hpp | 2 - .../single_layer/prior_box_clustered.hpp | 2 - .../src/single_layer/adaptive_pooling.cpp | 2 - .../src/single_layer/eye.cpp | 1 - .../src/single_layer/memory.cpp | 9 ++-- .../src/single_layer/reverse.cpp | 2 - .../src/single_layer/roi_align.cpp | 3 -- .../src/subgraph/parameter_shapeof_result.cpp | 1 - 44 files 
changed, 60 insertions(+), 453 deletions(-) delete mode 100644 src/core/include/ngraph/opsets/opset1.hpp delete mode 100644 src/core/include/ngraph/opsets/opset10.hpp delete mode 100644 src/core/include/ngraph/opsets/opset10_tbl.hpp delete mode 100644 src/core/include/ngraph/opsets/opset11.hpp delete mode 100644 src/core/include/ngraph/opsets/opset11_tbl.hpp delete mode 100644 src/core/include/ngraph/opsets/opset1_tbl.hpp delete mode 100644 src/core/include/ngraph/opsets/opset4.hpp delete mode 100644 src/core/include/ngraph/opsets/opset4_tbl.hpp delete mode 100644 src/core/include/ngraph/opsets/opset5.hpp delete mode 100644 src/core/include/ngraph/opsets/opset5_tbl.hpp delete mode 100644 src/core/include/ngraph/opsets/opset6.hpp delete mode 100644 src/core/include/ngraph/opsets/opset6_tbl.hpp delete mode 100644 src/core/include/ngraph/opsets/opset7.hpp delete mode 100644 src/core/include/ngraph/opsets/opset7_tbl.hpp delete mode 100644 src/core/include/ngraph/opsets/opset8.hpp delete mode 100644 src/core/include/ngraph/opsets/opset8_tbl.hpp delete mode 100644 src/core/include/ngraph/opsets/opset9.hpp delete mode 100644 src/core/include/ngraph/opsets/opset9_tbl.hpp diff --git a/src/core/include/ngraph/opsets/opset1.hpp b/src/core/include/ngraph/opsets/opset1.hpp deleted file mode 100644 index 42b3236287c470..00000000000000 --- a/src/core/include/ngraph/opsets/opset1.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#if !defined(IN_OV_COMPONENT) && !defined(NGRAPH_LEGACY_HEADER_INCLUDED) -# define NGRAPH_LEGACY_HEADER_INCLUDED -# ifdef _MSC_VER -# pragma message( \ - "The nGraph API is deprecated and will be removed in the 2024.0 release. For instructions on transitioning to the new API, please refer to https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html") -# else -# warning("The nGraph API is deprecated and will be removed in the 2024.0 release. For instructions on transitioning to the new API, please refer to https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html") -# endif -#endif - -#include "ngraph/ops.hpp" - -namespace ngraph { -namespace opset1 { -#define NGRAPH_OP(a, b) using b::a; -#include "ngraph/opsets/opset1_tbl.hpp" -#undef NGRAPH_OP -} // namespace opset1 -} // namespace ngraph diff --git a/src/core/include/ngraph/opsets/opset10.hpp b/src/core/include/ngraph/opsets/opset10.hpp deleted file mode 100644 index 66b248147aedeb..00000000000000 --- a/src/core/include/ngraph/opsets/opset10.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#if !defined(IN_OV_COMPONENT) && !defined(NGRAPH_LEGACY_HEADER_INCLUDED) -# define NGRAPH_LEGACY_HEADER_INCLUDED -# ifdef _MSC_VER -# pragma message( \ - "The nGraph API is deprecated and will be removed in the 2024.0 release. For instructions on transitioning to the new API, please refer to https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html") -# else -# warning("The nGraph API is deprecated and will be removed in the 2024.0 release. 
For instructions on transitioning to the new API, please refer to https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html") -# endif -#endif - -#include "ngraph/ops.hpp" - -namespace ngraph { -namespace opset10 { -#define NGRAPH_OP(a, b) using b::a; -#include "ngraph/opsets/opset10_tbl.hpp" -#undef NGRAPH_OP -} // namespace opset10 -} // namespace ngraph diff --git a/src/core/include/ngraph/opsets/opset10_tbl.hpp b/src/core/include/ngraph/opsets/opset10_tbl.hpp deleted file mode 100644 index f596071fa6e0b9..00000000000000 --- a/src/core/include/ngraph/opsets/opset10_tbl.hpp +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#ifndef NGRAPH_OP -# warning "NGRAPH_OP not defined" -# define NGRAPH_OP(x, y) -#endif - -#define _OPENVINO_OP_REG NGRAPH_OP -#include "openvino/opsets/opset10_tbl.hpp" -#undef _OPENVINO_OP_REG diff --git a/src/core/include/ngraph/opsets/opset11.hpp b/src/core/include/ngraph/opsets/opset11.hpp deleted file mode 100644 index a4a36bd2fa2e86..00000000000000 --- a/src/core/include/ngraph/opsets/opset11.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#if !defined(IN_OV_COMPONENT) && !defined(NGRAPH_LEGACY_HEADER_INCLUDED) -# define NGRAPH_LEGACY_HEADER_INCLUDED -# ifdef _MSC_VER -# pragma message( \ - "The nGraph API is deprecated and will be removed in the 2024.0 release. For instructions on transitioning to the new API, please refer to https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html") -# else -# warning("The nGraph API is deprecated and will be removed in the 2024.0 release. For instructions on transitioning to the new API, please refer to https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html") -# endif -#endif - -#include "ngraph/ops.hpp" - -namespace ngraph { -namespace opset11 { -#define NGRAPH_OP(a, b) using b::a; -#include "ngraph/opsets/opset11_tbl.hpp" -#undef NGRAPH_OP -} // namespace opset11 -} // namespace ngraph diff --git a/src/core/include/ngraph/opsets/opset11_tbl.hpp b/src/core/include/ngraph/opsets/opset11_tbl.hpp deleted file mode 100644 index c815946ecfd42c..00000000000000 --- a/src/core/include/ngraph/opsets/opset11_tbl.hpp +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#ifndef NGRAPH_OP -# warning "NGRAPH_OP not defined" -# define NGRAPH_OP(x, y) -#endif - -#define _OPENVINO_OP_REG NGRAPH_OP -#include "openvino/opsets/opset11_tbl.hpp" -#undef _OPENVINO_OP_REG diff --git a/src/core/include/ngraph/opsets/opset1_tbl.hpp b/src/core/include/ngraph/opsets/opset1_tbl.hpp deleted file mode 100644 index 955a5311d0c397..00000000000000 --- a/src/core/include/ngraph/opsets/opset1_tbl.hpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -// This collection contains one entry for each op. If an op is added it must be -// added to this list. -// -// In order to use this list you want to define a macro named exactly NGRAPH_OP -// When you are done you should undef the macro -// As an example if you wanted to make a list of all op names as strings you could do this: -// -// #define NGRAPH_OP(a,b) #a, -// std::vector op_names{ -// #include "this include file name" -// }; -// #undef NGRAPH_OP -// -// This sample expands to a list like this: -// "Abs", -// "Acos", -// ... 
-// -// #define NGRAPH_OP(a,b) b::a, -// std::vector op_names{ -// #include "this include file name" -// }; -// #undef NGRAPH_OP -// -// This sample expands to a list like this: -// ngraph::op::Abs, -// ngraph::op::Acos, -// ... -// -// It's that easy. You can use this for fun and profit. - -#ifndef NGRAPH_OP -# warning "NGRAPH_OP not defined" -# define NGRAPH_OP(x, y) -#endif - -#define _OPENVINO_OP_REG NGRAPH_OP -#include "openvino/opsets/opset1_tbl.hpp" -#undef _OPENVINO_OP_REG diff --git a/src/core/include/ngraph/opsets/opset2.hpp b/src/core/include/ngraph/opsets/opset2.hpp index 898c8fdfc5c1b2..d2f09479f941a2 100644 --- a/src/core/include/ngraph/opsets/opset2.hpp +++ b/src/core/include/ngraph/opsets/opset2.hpp @@ -15,7 +15,6 @@ #endif #include "ngraph/ops.hpp" -#include "ngraph/opsets/opset1.hpp" namespace ngraph { namespace opset2 { diff --git a/src/core/include/ngraph/opsets/opset4.hpp b/src/core/include/ngraph/opsets/opset4.hpp deleted file mode 100644 index 14cb115f0889f8..00000000000000 --- a/src/core/include/ngraph/opsets/opset4.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#if !defined(IN_OV_COMPONENT) && !defined(NGRAPH_LEGACY_HEADER_INCLUDED) -# define NGRAPH_LEGACY_HEADER_INCLUDED -# ifdef _MSC_VER -# pragma message( \ - "The nGraph API is deprecated and will be removed in the 2024.0 release. For instructions on transitioning to the new API, please refer to https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html") -# else -# warning("The nGraph API is deprecated and will be removed in the 2024.0 release. For instructions on transitioning to the new API, please refer to https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html") -# endif -#endif - -#include "ngraph/ops.hpp" - -namespace ngraph { -namespace opset4 { -#define NGRAPH_OP(a, b) using b::a; -#include "ngraph/opsets/opset4_tbl.hpp" -#undef NGRAPH_OP -} // namespace opset4 -} // namespace ngraph diff --git a/src/core/include/ngraph/opsets/opset4_tbl.hpp b/src/core/include/ngraph/opsets/opset4_tbl.hpp deleted file mode 100644 index 2001838ce9cc43..00000000000000 --- a/src/core/include/ngraph/opsets/opset4_tbl.hpp +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#ifndef NGRAPH_OP -# warning "NGRAPH_OP not defined" -# define NGRAPH_OP(x, y) -#endif - -#define _OPENVINO_OP_REG NGRAPH_OP -#include "openvino/opsets/opset4_tbl.hpp" -#undef _OPENVINO_OP_REG diff --git a/src/core/include/ngraph/opsets/opset5.hpp b/src/core/include/ngraph/opsets/opset5.hpp deleted file mode 100644 index 48cc80ef87cbb6..00000000000000 --- a/src/core/include/ngraph/opsets/opset5.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#if !defined(IN_OV_COMPONENT) && !defined(NGRAPH_LEGACY_HEADER_INCLUDED) -# define NGRAPH_LEGACY_HEADER_INCLUDED -# ifdef _MSC_VER -# pragma message( \ - "The nGraph API is deprecated and will be removed in the 2024.0 release. For instructions on transitioning to the new API, please refer to https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html") -# else -# warning("The nGraph API is deprecated and will be removed in the 2024.0 release. 
For instructions on transitioning to the new API, please refer to https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html") -# endif -#endif - -#include "ngraph/ops.hpp" - -namespace ngraph { -namespace opset5 { -#define NGRAPH_OP(a, b) using b::a; -#include "ngraph/opsets/opset5_tbl.hpp" -#undef NGRAPH_OP -} // namespace opset5 -} // namespace ngraph diff --git a/src/core/include/ngraph/opsets/opset5_tbl.hpp b/src/core/include/ngraph/opsets/opset5_tbl.hpp deleted file mode 100644 index bfd1d93357e981..00000000000000 --- a/src/core/include/ngraph/opsets/opset5_tbl.hpp +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#ifndef NGRAPH_OP -# warning "NGRAPH_OP not defined" -# define NGRAPH_OP(x, y) -#endif - -#define _OPENVINO_OP_REG NGRAPH_OP -#include "openvino/opsets/opset5_tbl.hpp" -#undef _OPENVINO_OP_REG diff --git a/src/core/include/ngraph/opsets/opset6.hpp b/src/core/include/ngraph/opsets/opset6.hpp deleted file mode 100644 index 29fbc43292681a..00000000000000 --- a/src/core/include/ngraph/opsets/opset6.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#if !defined(IN_OV_COMPONENT) && !defined(NGRAPH_LEGACY_HEADER_INCLUDED) -# define NGRAPH_LEGACY_HEADER_INCLUDED -# ifdef _MSC_VER -# pragma message( \ - "The nGraph API is deprecated and will be removed in the 2024.0 release. For instructions on transitioning to the new API, please refer to https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html") -# else -# warning("The nGraph API is deprecated and will be removed in the 2024.0 release. For instructions on transitioning to the new API, please refer to https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html") -# endif -#endif - -#include "ngraph/ops.hpp" - -namespace ngraph { -namespace opset6 { -#define NGRAPH_OP(a, b) using b::a; -#include "ngraph/opsets/opset6_tbl.hpp" -#undef NGRAPH_OP -} // namespace opset6 -} // namespace ngraph diff --git a/src/core/include/ngraph/opsets/opset6_tbl.hpp b/src/core/include/ngraph/opsets/opset6_tbl.hpp deleted file mode 100644 index 26d64306f3224f..00000000000000 --- a/src/core/include/ngraph/opsets/opset6_tbl.hpp +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#ifndef NGRAPH_OP -# warning "NGRAPH_OP not defined" -# define NGRAPH_OP(x, y) -#endif - -#define _OPENVINO_OP_REG NGRAPH_OP -#include "openvino/opsets/opset6_tbl.hpp" -#undef _OPENVINO_OP_REG diff --git a/src/core/include/ngraph/opsets/opset7.hpp b/src/core/include/ngraph/opsets/opset7.hpp deleted file mode 100644 index 4ca9e2b94d3698..00000000000000 --- a/src/core/include/ngraph/opsets/opset7.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#if !defined(IN_OV_COMPONENT) && !defined(NGRAPH_LEGACY_HEADER_INCLUDED) -# define NGRAPH_LEGACY_HEADER_INCLUDED -# ifdef _MSC_VER -# pragma message( \ - "The nGraph API is deprecated and will be removed in the 2024.0 release. For instructions on transitioning to the new API, please refer to https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html") -# else -# warning("The nGraph API is deprecated and will be removed in the 2024.0 release. 
For instructions on transitioning to the new API, please refer to https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html") -# endif -#endif - -#include "ngraph/ops.hpp" - -namespace ngraph { -namespace opset7 { -#define NGRAPH_OP(a, b) using b::a; -#include "ngraph/opsets/opset7_tbl.hpp" -#undef NGRAPH_OP -} // namespace opset7 -} // namespace ngraph diff --git a/src/core/include/ngraph/opsets/opset7_tbl.hpp b/src/core/include/ngraph/opsets/opset7_tbl.hpp deleted file mode 100644 index 36c535fda2bce6..00000000000000 --- a/src/core/include/ngraph/opsets/opset7_tbl.hpp +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#ifndef NGRAPH_OP -# warning "NGRAPH_OP not defined" -# define NGRAPH_OP(x, y) -#endif - -#define _OPENVINO_OP_REG NGRAPH_OP -#include "openvino/opsets/opset7_tbl.hpp" -#undef _OPENVINO_OP_REG diff --git a/src/core/include/ngraph/opsets/opset8.hpp b/src/core/include/ngraph/opsets/opset8.hpp deleted file mode 100644 index 5b3e634d6fe071..00000000000000 --- a/src/core/include/ngraph/opsets/opset8.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#if !defined(IN_OV_COMPONENT) && !defined(NGRAPH_LEGACY_HEADER_INCLUDED) -# define NGRAPH_LEGACY_HEADER_INCLUDED -# ifdef _MSC_VER -# pragma message( \ - "The nGraph API is deprecated and will be removed in the 2024.0 release. For instructions on transitioning to the new API, please refer to https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html") -# else -# warning("The nGraph API is deprecated and will be removed in the 2024.0 release. For instructions on transitioning to the new API, please refer to https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html") -# endif -#endif - -#include "ngraph/ops.hpp" - -namespace ngraph { -namespace opset8 { -#define NGRAPH_OP(a, b) using b::a; -#include "ngraph/opsets/opset8_tbl.hpp" -#undef NGRAPH_OP -} // namespace opset8 -} // namespace ngraph diff --git a/src/core/include/ngraph/opsets/opset8_tbl.hpp b/src/core/include/ngraph/opsets/opset8_tbl.hpp deleted file mode 100644 index a707e4f861e17e..00000000000000 --- a/src/core/include/ngraph/opsets/opset8_tbl.hpp +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#ifndef NGRAPH_OP -# warning "NGRAPH_OP not defined" -# define NGRAPH_OP(x, y) -#endif - -#define _OPENVINO_OP_REG NGRAPH_OP -#include "openvino/opsets/opset8_tbl.hpp" -#undef _OPENVINO_OP_REG diff --git a/src/core/include/ngraph/opsets/opset9.hpp b/src/core/include/ngraph/opsets/opset9.hpp deleted file mode 100644 index 78e5e5726d47a3..00000000000000 --- a/src/core/include/ngraph/opsets/opset9.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#if !defined(IN_OV_COMPONENT) && !defined(NGRAPH_LEGACY_HEADER_INCLUDED) -# define NGRAPH_LEGACY_HEADER_INCLUDED -# ifdef _MSC_VER -# pragma message( \ - "The nGraph API is deprecated and will be removed in the 2024.0 release. For instructions on transitioning to the new API, please refer to https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html") -# else -# warning("The nGraph API is deprecated and will be removed in the 2024.0 release. 
For instructions on transitioning to the new API, please refer to https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html") -# endif -#endif - -#include "ngraph/ops.hpp" - -namespace ngraph { -namespace opset9 { -#define NGRAPH_OP(a, b) using b::a; -#include "ngraph/opsets/opset9_tbl.hpp" -#undef NGRAPH_OP -} // namespace opset9 -} // namespace ngraph diff --git a/src/core/include/ngraph/opsets/opset9_tbl.hpp b/src/core/include/ngraph/opsets/opset9_tbl.hpp deleted file mode 100644 index f77e340a516cd6..00000000000000 --- a/src/core/include/ngraph/opsets/opset9_tbl.hpp +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#ifndef NGRAPH_OP -# warning "NGRAPH_OP not defined" -# define NGRAPH_OP(x, y) -#endif - -#define _OPENVINO_OP_REG NGRAPH_OP -#include "openvino/opsets/opset9_tbl.hpp" -#undef _OPENVINO_OP_REG diff --git a/src/frontends/onnx/frontend/src/op/gather.hpp b/src/frontends/onnx/frontend/src/op/gather.hpp index 330e8fc434d880..15f826f5f809b6 100644 --- a/src/frontends/onnx/frontend/src/op/gather.hpp +++ b/src/frontends/onnx/frontend/src/op/gather.hpp @@ -10,9 +10,9 @@ OPENVINO_SUPPRESS_DEPRECATED_START #include #include "ngraph/node.hpp" -#include "ngraph/opsets/opset8.hpp" #include "ngraph/validation_util.hpp" #include "onnx_import/core/node.hpp" +#include "openvino/op/gather.hpp" namespace ngraph { namespace onnx_import { @@ -24,9 +24,9 @@ inline OutputVector gather(const Node& node) { auto indices = ng_inputs.at(1); auto axis = node.get_attribute_value("axis", 0); - return {std::make_shared(data, - indices, - default_opset::Constant::create(element::i64, Shape{}, {axis}))}; + return {std::make_shared(data, + indices, + default_opset::Constant::create(element::i64, Shape{}, {axis}))}; } } // namespace set_1 diff --git a/src/frontends/onnx/frontend/src/op/if.cpp b/src/frontends/onnx/frontend/src/op/if.cpp index 2e1cdf21f217ed..5c50eca7ffedc8 100644 --- a/src/frontends/onnx/frontend/src/op/if.cpp +++ b/src/frontends/onnx/frontend/src/op/if.cpp @@ -6,8 +6,8 @@ #include "core/graph.hpp" #include "ngraph/node.hpp" -#include "ngraph/opsets/opset8.hpp" #include "openvino/frontend/exception.hpp" +#include "openvino/op/if.hpp" OPENVINO_SUPPRESS_DEPRECATED_START namespace ngraph { @@ -30,7 +30,7 @@ OutputVector if_op(const Node& node) { auto else_branch = std::make_shared(else_subgraph->get_ov_outputs(), else_params, else_subgraph->get_name()); - auto if_node = std::make_shared(ng_inputs.at(0)); + auto if_node = std::make_shared(ng_inputs.at(0)); if_node->set_then_body(then_branch); if_node->set_else_body(else_branch); diff --git a/src/frontends/onnx/frontend/src/op/lstm.cpp b/src/frontends/onnx/frontend/src/op/lstm.cpp index f8255346b9cd39..495d34f119dc6f 100644 --- a/src/frontends/onnx/frontend/src/op/lstm.cpp +++ b/src/frontends/onnx/frontend/src/op/lstm.cpp @@ -18,7 +18,6 @@ #include "ngraph/op/constant.hpp" #include "ngraph/op/lstm_sequence.hpp" #include "ngraph/op/util/attr_types.hpp" -#include "ngraph/opsets/opset3.hpp" #include "ngraph/shape.hpp" #include "ngraph/type/element_type.hpp" #include "onnx_import/core/null_node.hpp" diff --git a/src/frontends/onnx/frontend/src/op/mean_variance_normalization.cpp b/src/frontends/onnx/frontend/src/op/mean_variance_normalization.cpp index 6a1dca496f6d2c..6c94dabce3be4d 100644 --- a/src/frontends/onnx/frontend/src/op/mean_variance_normalization.cpp +++ b/src/frontends/onnx/frontend/src/op/mean_variance_normalization.cpp @@ -10,7 +10,6 @@ #include 
"default_opset.hpp" #include "ngraph/axis_set.hpp" #include "ngraph/op/mvn.hpp" -#include "ngraph/opsets/opset5.hpp" #include "ngraph/validation_util.hpp" OPENVINO_SUPPRESS_DEPRECATED_START @@ -23,7 +22,7 @@ OutputVector mean_variance_normalization(const Node& node) { bool across_channels = node.get_attribute_value("across_channels", 0); bool normalize_variance = node.get_attribute_value("normalize_variance", 1); - return {std::make_shared(data, across_channels, normalize_variance)}; + return {std::make_shared(data, across_channels, normalize_variance)}; } } // namespace set_1 @@ -37,8 +36,7 @@ OutputVector mean_variance_normalization(const Node& node) { ngraph::normalize_axes(node.get_description(), axes, data.get_partial_shape().rank()); OPENVINO_SUPPRESS_DEPRECATED_END auto const_axes = default_opset::Constant::create(element::i64, Shape{normalized_axes.size()}, normalized_axes); - return { - std::make_shared(data, const_axes, true, 1e-09f, ngraph::op::MVNEpsMode::OUTSIDE_SQRT)}; + return {std::make_shared(data, const_axes, true, 1e-09f, ngraph::op::MVNEpsMode::OUTSIDE_SQRT)}; } } // namespace set_9 diff --git a/src/frontends/onnx/frontend/src/op/qlinear_conv.cpp b/src/frontends/onnx/frontend/src/op/qlinear_conv.cpp index 2fe98f98fcb2b5..91dd6ff10f4440 100644 --- a/src/frontends/onnx/frontend/src/op/qlinear_conv.cpp +++ b/src/frontends/onnx/frontend/src/op/qlinear_conv.cpp @@ -14,8 +14,9 @@ #include "conv.hpp" #include "dequantize_linear.hpp" #include "exceptions.hpp" -#include "ngraph/opsets/opset6.hpp" #include "onnx_import/core/null_node.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/multiply.hpp" #include "quantize_linear.hpp" OPENVINO_SUPPRESS_DEPRECATED_START @@ -38,18 +39,18 @@ OutputVector qlinear_conv(const Node& node) { x = set_13::detail::dequantize_linear(x, x_scale, - std::make_shared(x_zero_point, element::f32), + std::make_shared(x_zero_point, element::f32), 1, node)[0]; w = set_13::detail::dequantize_linear(w, w_scale, - std::make_shared(w_zero_point, element::f32), + std::make_shared(w_zero_point, element::f32), 1, node)[0]; if (!ov::op::util::is_null(B)) { - B = std::make_shared(std::make_shared(B, x_scale.get_element_type()), - std::make_shared(x_scale, w_scale)) + B = std::make_shared(std::make_shared(B, x_scale.get_element_type()), + std::make_shared(x_scale, w_scale)) ->output(0); } diff --git a/src/frontends/onnx/frontend/src/op/qlinear_matmul.cpp b/src/frontends/onnx/frontend/src/op/qlinear_matmul.cpp index c6f0077e7cbac8..67dde8b5b8908a 100644 --- a/src/frontends/onnx/frontend/src/op/qlinear_matmul.cpp +++ b/src/frontends/onnx/frontend/src/op/qlinear_matmul.cpp @@ -10,7 +10,7 @@ #include "dequantize_linear.hpp" #include "matmul.hpp" -#include "ngraph/opsets/opset6.hpp" +#include "openvino/op/convert.hpp" #include "quantize_linear.hpp" #include "utils/reshape.hpp" @@ -34,13 +34,13 @@ OutputVector qlinear_matmul(const Node& node) { const auto& dequnatize_a = set_13::detail::dequantize_linear(a, a_scale, - std::make_shared(a_zero_point, element::f32), + std::make_shared(a_zero_point, element::f32), 1, node); const auto& dequnatize_b = set_13::detail::dequantize_linear(b, b_scale, - std::make_shared(b_zero_point, element::f32), + std::make_shared(b_zero_point, element::f32), 1, node); diff --git a/src/frontends/onnx/frontend/src/op/random_uniform.cpp b/src/frontends/onnx/frontend/src/op/random_uniform.cpp index e07ddcc1c08585..95ab25b8f79470 100644 --- a/src/frontends/onnx/frontend/src/op/random_uniform.cpp +++ 
b/src/frontends/onnx/frontend/src/op/random_uniform.cpp @@ -7,7 +7,6 @@ #include "default_opset.hpp" #include "exceptions.hpp" #include "ngraph/op/constant.hpp" -#include "ngraph/opsets/opset8.hpp" #include "ngraph/shape.hpp" #include "utils/common.hpp" @@ -32,12 +31,12 @@ OutputVector random_uniform(const Node& node) { // TODO: This multiplication leads to a mismatch in accuracy. Issue: 123003 const auto seed_uint64 = static_cast(seed * 1000); - return {std::make_shared(target_shape_const, - low_const, - high_const, - target_type, - global_seed, - seed_uint64)}; + return {std::make_shared(target_shape_const, + low_const, + high_const, + target_type, + global_seed, + seed_uint64)}; } } // namespace set_1 diff --git a/src/frontends/onnx/frontend/src/op/random_uniform_like.cpp b/src/frontends/onnx/frontend/src/op/random_uniform_like.cpp index d3768f3127e5ae..6fbaba619cf5dc 100644 --- a/src/frontends/onnx/frontend/src/op/random_uniform_like.cpp +++ b/src/frontends/onnx/frontend/src/op/random_uniform_like.cpp @@ -7,7 +7,6 @@ #include "default_opset.hpp" #include "exceptions.hpp" #include "ngraph/op/constant.hpp" -#include "ngraph/opsets/opset8.hpp" #include "ngraph/shape.hpp" #include "utils/common.hpp" @@ -38,12 +37,12 @@ OutputVector random_uniform_like(const Node& node) { const uint64_t global_seed = 0; const auto seed_uint64 = static_cast(seed * 1000); - return {std::make_shared(target_shape, - low_const, - high_const, - target_type, - global_seed, - seed_uint64)}; + return {std::make_shared(target_shape, + low_const, + high_const, + target_type, + global_seed, + seed_uint64)}; } } // namespace set_1 diff --git a/src/frontends/onnx/frontend/src/op/roi_align.cpp b/src/frontends/onnx/frontend/src/op/roi_align.cpp index fe006258c45fd3..fbdb77e0246e3a 100644 --- a/src/frontends/onnx/frontend/src/op/roi_align.cpp +++ b/src/frontends/onnx/frontend/src/op/roi_align.cpp @@ -6,8 +6,8 @@ #include -#include "ngraph/opsets/opset9.hpp" #include "openvino/frontend/exception.hpp" +#include "openvino/op/roi_align.hpp" OPENVINO_SUPPRESS_DEPRECATED_START namespace ngraph { @@ -28,18 +28,18 @@ OutputVector roi_align(const Node& node) { const auto sampling_ratio = static_cast(node.get_attribute_value("sampling_ratio", 1)); const auto spatial_scale = node.get_attribute_value("spatial_scale", 1.0f); const auto mode = node.get_attribute_value("mode", "avg"); - const auto pooling_mode = EnumNames::as_enum(mode); - const auto aligned_mode = opset9::ROIAlign::AlignedMode::ASYMMETRIC; // Compatible up to ONNX-opset16 - - return {std::make_shared(data, - rois, - num_rois, - pooled_h, - pooled_w, - sampling_ratio, - spatial_scale, - pooling_mode, - aligned_mode)}; + const auto pooling_mode = EnumNames::as_enum(mode); + const auto aligned_mode = ov::op::v9::ROIAlign::AlignedMode::ASYMMETRIC; // Compatible up to ONNX-opset16 + + return {std::make_shared(data, + rois, + num_rois, + pooled_h, + pooled_w, + sampling_ratio, + spatial_scale, + pooling_mode, + aligned_mode)}; } } // namespace set_1 namespace set_16 { @@ -57,25 +57,25 @@ OutputVector roi_align(const Node& node) { const auto sampling_ratio = node.get_attribute_value("sampling_ratio", 1); const auto spatial_scale = node.get_attribute_value("spatial_scale", 1.0f); const auto mode = node.get_attribute_value("mode", "avg"); - const auto pooling_mode = EnumNames::as_enum(mode); + const auto pooling_mode = EnumNames::as_enum(mode); const auto coordinate_transformation_mode = node.get_attribute_value("coordinate_transformation_mode", ""); - auto aligned_mode = 
opset9::ROIAlign::AlignedMode::HALF_PIXEL_FOR_NN; // Match ONNX ROIAlign-16 default + auto aligned_mode = ov::op::v9::ROIAlign::AlignedMode::HALF_PIXEL_FOR_NN; // Match ONNX ROIAlign-16 default if (coordinate_transformation_mode == "output_half_pixel") { - aligned_mode = opset9::ROIAlign::AlignedMode::ASYMMETRIC; + aligned_mode = ov::op::v9::ROIAlign::AlignedMode::ASYMMETRIC; } - return {std::make_shared(data, - rois, - num_rois, - static_cast(pooled_h), - static_cast(pooled_w), - static_cast(sampling_ratio), - spatial_scale, - pooling_mode, - aligned_mode)}; + return {std::make_shared(data, + rois, + num_rois, + static_cast(pooled_h), + static_cast(pooled_w), + static_cast(sampling_ratio), + spatial_scale, + pooling_mode, + aligned_mode)}; } } // namespace set_16 diff --git a/src/frontends/onnx/frontend/src/op/softsign.cpp b/src/frontends/onnx/frontend/src/op/softsign.cpp index 6ddee06bfe936b..c6fd91a190b111 100644 --- a/src/frontends/onnx/frontend/src/op/softsign.cpp +++ b/src/frontends/onnx/frontend/src/op/softsign.cpp @@ -8,7 +8,6 @@ #include #include "default_opset.hpp" -#include "ngraph/opsets/opset9.hpp" #include "ngraph/shape.hpp" OPENVINO_SUPPRESS_DEPRECATED_START @@ -17,7 +16,7 @@ namespace onnx_import { namespace op { namespace set_1 { OutputVector softsign(const Node& node) { - return {std::make_shared(node.get_ng_inputs().at(0))}; + return {std::make_shared(node.get_ng_inputs().at(0))}; } } // namespace set_1 } // namespace op diff --git a/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/concat_reorder_inplace.cpp b/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/concat_reorder_inplace.cpp index 72492463da3e46..e72590de5e48d4 100644 --- a/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/concat_reorder_inplace.cpp +++ b/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/concat_reorder_inplace.cpp @@ -2,8 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // -#include - #include "common_test_utils/node_builders/constant.hpp" #include "ov_models/utils/ov_helpers.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/depth_to_space.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/depth_to_space.cpp index ca7ac19ac93d11..dfa44b5758a4ea 100644 --- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/depth_to_space.cpp +++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/depth_to_space.cpp @@ -3,7 +3,6 @@ // #include -#include #include "single_op_tests/depth_to_space.hpp" #include "common_test_utils/test_constants.hpp" diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/gather_elements.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/gather_elements.cpp index ceb5b433de6a54..a27aa05c4f9f69 100644 --- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/gather_elements.cpp +++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/gather_elements.cpp @@ -3,7 +3,6 @@ // #include -#include #include "single_op_tests/gather_elements.hpp" #include "common_test_utils/test_constants.hpp" diff --git a/src/tests/functional/plugin/shared/include/behavior/plugin/hetero_query_network.hpp b/src/tests/functional/plugin/shared/include/behavior/plugin/hetero_query_network.hpp index c34bbc44c0a21e..bbe9239e439d93 100644 --- 
a/src/tests/functional/plugin/shared/include/behavior/plugin/hetero_query_network.hpp +++ b/src/tests/functional/plugin/shared/include/behavior/plugin/hetero_query_network.hpp @@ -6,7 +6,6 @@ #include "common_test_utils/test_common.hpp" -#include #include using namespace InferenceEngine; diff --git a/src/tests/functional/plugin/shared/src/low_precision_transformations/depth_to_space_transformation.cpp b/src/tests/functional/plugin/shared/src/low_precision_transformations/depth_to_space_transformation.cpp index 84a35de492b7f6..ef1348d619ce46 100644 --- a/src/tests/functional/plugin/shared/src/low_precision_transformations/depth_to_space_transformation.cpp +++ b/src/tests/functional/plugin/shared/src/low_precision_transformations/depth_to_space_transformation.cpp @@ -19,7 +19,6 @@ #include "ov_models/builders.hpp" #include -#include #include #include #include diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/grn.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/grn.hpp index 585f08f0beaa96..2574eef28c6f7e 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/grn.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/grn.hpp @@ -17,8 +17,6 @@ #include "ie_core.hpp" #include "ie_precision.hpp" -#include "ngraph/opsets/opset1.hpp" - #include "functional_test_utils/blob_utils.hpp" #include "shared_test_classes/base/layer_test_utils.hpp" #include "common_test_utils/common_utils.hpp" diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/memory.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/memory.hpp index d39931e79366a8..01a4e22b9e98fb 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/memory.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/memory.hpp @@ -9,9 +9,6 @@ #include #include -#include "ngraph/opsets/opset6.hpp" -#include "ngraph/opsets/opset3.hpp" - #include "shared_test_classes/base/layer_test_utils.hpp" namespace LayerTestsDefinitions { diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/prior_box.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/prior_box.hpp index b1e752c12bae59..218fb3028f67e0 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/prior_box.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/prior_box.hpp @@ -17,8 +17,6 @@ #include "ie_core.hpp" #include "ie_precision.hpp" -#include "ngraph/opsets/opset1.hpp" - #include "functional_test_utils/blob_utils.hpp" #include "shared_test_classes/base/layer_test_utils.hpp" #include "common_test_utils/common_utils.hpp" diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/prior_box_clustered.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/prior_box_clustered.hpp index b712b4c9a09b75..60642609388e4a 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/prior_box_clustered.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/prior_box_clustered.hpp @@ -17,8 +17,6 @@ #include "ie_core.hpp" #include "ie_precision.hpp" -#include "ngraph/opsets/opset1.hpp" - #include 
"functional_test_utils/blob_utils.hpp" #include "shared_test_classes/base/layer_test_utils.hpp" #include "common_test_utils/common_utils.hpp" diff --git a/src/tests/functional/shared_test_classes/src/single_layer/adaptive_pooling.cpp b/src/tests/functional/shared_test_classes/src/single_layer/adaptive_pooling.cpp index cb5019388da146..e9bbfaea2d32b5 100644 --- a/src/tests/functional/shared_test_classes/src/single_layer/adaptive_pooling.cpp +++ b/src/tests/functional/shared_test_classes/src/single_layer/adaptive_pooling.cpp @@ -2,8 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // -#include - #include "ov_models/builders.hpp" #include "common_test_utils/node_builders/constant.hpp" #include "shared_test_classes/single_layer/adaptive_pooling.hpp" diff --git a/src/tests/functional/shared_test_classes/src/single_layer/eye.cpp b/src/tests/functional/shared_test_classes/src/single_layer/eye.cpp index 484a010da483f3..95105c34b9a91a 100644 --- a/src/tests/functional/shared_test_classes/src/single_layer/eye.cpp +++ b/src/tests/functional/shared_test_classes/src/single_layer/eye.cpp @@ -4,7 +4,6 @@ #include "shared_test_classes/single_layer/eye.hpp" #include -#include #include #include diff --git a/src/tests/functional/shared_test_classes/src/single_layer/memory.cpp b/src/tests/functional/shared_test_classes/src/single_layer/memory.cpp index 15a3034e4a1d4b..cec0846756b65b 100644 --- a/src/tests/functional/shared_test_classes/src/single_layer/memory.cpp +++ b/src/tests/functional/shared_test_classes/src/single_layer/memory.cpp @@ -10,13 +10,14 @@ #include #include -#include "ngraph/opsets/opset7.hpp" #include "ngraph/pass/low_latency.hpp" #include "openvino/op/util/variable_context.hpp" #include "ov_models/builders.hpp" using namespace ngraph; -using namespace opset7; +using ov::op::v1::Add; +using ov::op::v0::TensorIterator; +using ov::op::v0::Result; namespace LayerTestsDefinitions { @@ -191,9 +192,9 @@ void MemoryTest::CreateCommonFunc() { : VariableInfo{inputShape, ngPrc, "v0"}; auto variable = std::make_shared(variable_info); auto read_value = CreateReadValueOp(param.at(0), variable); - auto add = std::make_shared(read_value, param.at(0)); + auto add = std::make_shared(read_value, param.at(0)); auto assign = CreateAssignOp(add, variable); - auto res = std::make_shared(add); + auto res = std::make_shared(add); function = std::make_shared(ResultVector{res}, SinkVector{assign}, param, "TestMemory"); } diff --git a/src/tests/functional/shared_test_classes/src/single_layer/reverse.cpp b/src/tests/functional/shared_test_classes/src/single_layer/reverse.cpp index b6f506092b16c0..eedb35fe746ac2 100644 --- a/src/tests/functional/shared_test_classes/src/single_layer/reverse.cpp +++ b/src/tests/functional/shared_test_classes/src/single_layer/reverse.cpp @@ -4,8 +4,6 @@ #include "shared_test_classes/single_layer/reverse.hpp" -#include - #include "ov_models/builders.hpp" using namespace InferenceEngine; diff --git a/src/tests/functional/shared_test_classes/src/single_layer/roi_align.cpp b/src/tests/functional/shared_test_classes/src/single_layer/roi_align.cpp index 683246fc841970..87e02e82dd7f70 100644 --- a/src/tests/functional/shared_test_classes/src/single_layer/roi_align.cpp +++ b/src/tests/functional/shared_test_classes/src/single_layer/roi_align.cpp @@ -4,9 +4,6 @@ #include "shared_test_classes/single_layer/roi_align.hpp" -#include -#include - #include "ov_models/builders.hpp" #include "openvino/core/enum_names.hpp" diff --git 
a/src/tests/functional/shared_test_classes/src/subgraph/parameter_shapeof_result.cpp b/src/tests/functional/shared_test_classes/src/subgraph/parameter_shapeof_result.cpp
index c68222efcf659b..c852fefdb401b7 100644
--- a/src/tests/functional/shared_test_classes/src/subgraph/parameter_shapeof_result.cpp
+++ b/src/tests/functional/shared_test_classes/src/subgraph/parameter_shapeof_result.cpp
@@ -5,7 +5,6 @@
 
 #include "shared_test_classes/subgraph/parameter_shapeof_result.hpp"
 
 #include
-#include
 
 namespace SubgraphTestsDefinitions {

From e7791d45496542d647be4dd297ed5f7a9e89a503 Mon Sep 17 00:00:00 2001
From: Vishniakov Nikolai
Date: Sat, 13 Jan 2024 10:09:37 +0100
Subject: [PATCH 04/13] Avoid DOWNLOAD_EXTRACT_TIMESTAMP warning (#22135)

* Avoid DOWNLOAD_EXTRACT_TIMESTAMP warning

* Change applying policy condition

Co-authored-by: Ilya Lavrenov

---------

Co-authored-by: Ilya Lavrenov
---
 CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 82277e5c875cfb..549f7c40a2a6e8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,6 +28,11 @@ if(POLICY CMP0091)
     cmake_policy(SET CMP0091 NEW) # Enables use of MSVC_RUNTIME_LIBRARY
 endif()
 
+# Avoid warning about DOWNLOAD_EXTRACT_TIMESTAMP in CMake 3.24:
+if(POLICY CMP0135)
+    cmake_policy(SET CMP0135 NEW)
+endif()
+
 project(OpenVINO DESCRIPTION "OpenVINO toolkit")
 
 find_package(OpenVINODeveloperScripts REQUIRED

From f7849319d6aa927c8dcad76db7cbe5e8e7a46e34 Mon Sep 17 00:00:00 2001
From: Steve Yoo
Date: Sun, 14 Jan 2024 09:06:49 +0900
Subject: [PATCH 05/13] [GPU] Initial update for CTCGreedyDecoderSeqLen dynamic shape and multiple outputs support (#21564)

* [GPU] Initial update for CTCGreedyDecoderSeqLen dynamic shape and multiple outputs support (#21564)

- primitive API update for dynamic shape
- add shape inference test cases

* Add functional tests for multiple outputs in dynamic shapes

* Added dynamic shape support and functional tests for CTCGreedyDecoder

* Fix reshape_inst to access the intended input layout

* Add load/save/hash methods for the primitive (#21564)

* Remove the output idx update so it can go into another PR (#21564)

* Fix JitConstants to avoid confusion

* Update macros and remove use_multiple_outputs (#21564)

* Update to use #elif (#21564)
---
 .../primitives/ctc_greedy_decoder.hpp | 17 +-
 .../src/graph/ctc_greedy_decoder.cpp | 41 ++++
 .../graph/impls/ocl/ctc_greedy_decoder.cpp | 26 ++-
 .../graph/include/ctc_greedy_decoder_inst.h | 2 +
 .../cl_kernels/ctc_greedy_decoder_ref.cl | 10 +-
 .../ctc_greedy_decoder_kernel_base.cpp | 16 +-
 .../src/plugin/ops/ctc_greedy_decoder.cpp | 144 ++++++++-----
 .../ctc_greedy_decoder_seq_len.cpp | 2 +-
 .../dynamic/ctc_greedy_decoder.cpp | 116 +++++++++++
 .../dynamic/ctc_greedy_decoder_seq_len.cpp | 190 ++++++++++++++++++
 .../ctc_greedy_decoder_seq_len_si_test.cpp | 101 ++++++++++
 11 files changed, 604 insertions(+), 61 deletions(-)
 create mode 100644 src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/ctc_greedy_decoder.cpp
 create mode 100644 src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/ctc_greedy_decoder_seq_len.cpp
 create mode 100644 src/plugins/intel_gpu/tests/unit/shape_infer/ctc_greedy_decoder_seq_len_si_test.cpp

diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/ctc_greedy_decoder.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/ctc_greedy_decoder.hpp
index 2cf19ca58cc44f..293a646cc64427 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/primitives/ctc_greedy_decoder.hpp
+++ 
b/src/plugins/intel_gpu/include/intel_gpu/primitives/ctc_greedy_decoder.hpp
@@ -29,7 +29,22 @@ struct ctc_greedy_decoder : public primitive_base {
         , ctc_merge_repeated(ctc_merge_repeated)
         , output_tensor(output_tensor) {}
 
-    uint32_t blank_index;
+    /// @brief Constructs ctc_greedy_decoder primitive.
+    /// @param id This primitive id.
+    /// @param inputs Input primitive ids (input, sequence_indicators, blank_index (optional)).
+    /// @param ctc_merge_repeated Flag for merging repeated labels during the CTC calculation.
+    ctc_greedy_decoder(const primitive_id& id,
+                       const std::vector& inputs,
+                       const uint32_t blank_index,
+                       const bool ctc_merge_repeated,
+                       const padding& output_padding = padding(),
+                       data_types output_data_type = data_types::i32,
+                       const size_t num_outputs = 1)
+        : primitive_base(id, inputs, {output_padding}, {optional_data_type{output_data_type}}, num_outputs)
+        , blank_index(blank_index)
+        , ctc_merge_repeated(ctc_merge_repeated) {}
+
+    uint32_t blank_index = UINT32_MAX;
     bool ctc_merge_repeated = false;
     tensor output_tensor;
     primitive_id second_output;
diff --git a/src/plugins/intel_gpu/src/graph/ctc_greedy_decoder.cpp b/src/plugins/intel_gpu/src/graph/ctc_greedy_decoder.cpp
index f43c2131948363..a22abf5dde0874 100644
--- a/src/plugins/intel_gpu/src/graph/ctc_greedy_decoder.cpp
+++ b/src/plugins/intel_gpu/src/graph/ctc_greedy_decoder.cpp
@@ -7,6 +7,9 @@
 #include "json_object.h"
 #include
 
+#include "ctc_greedy_decoder_seq_len_shape_inference.hpp"
+#include "ctc_greedy_decoder_shape_inference.hpp"
+
 namespace cldnn {
 GPU_DEFINE_PRIMITIVE_TYPE_ID(ctc_greedy_decoder)
 
@@ -18,6 +21,44 @@ layout ctc_greedy_decoder_inst::calc_output_layout(ctc_greedy_decoder_node const
     return layout(output_type, input_node_layout.format, prim->output_tensor);
 }
 
+template
+std::vector ctc_greedy_decoder_inst::calc_output_layouts(ctc_greedy_decoder_node const& /*node*/, const kernel_impl_params& impl_param) {
+    std::vector layouts;
+
+    auto desc = impl_param.typed_desc();
+
+    std::vector input_shapes;
+    for (size_t i = 0; i < desc->input.size(); ++i) {
+        auto input_shape = impl_param.get_input_layout(i).get();
+        input_shapes.push_back(input_shape);
+    }
+
+    if (desc->num_outputs == 1) {
+        ov::op::v0::CTCGreedyDecoder op;
+
+        std::vector output_shapes = ov::op::v0::shape_infer(&op, input_shapes);
+
+        auto dt = desc->get_output_data_type(0).value_or(impl_param.get_input_layout(0).data_type);
+        layouts.push_back({output_shapes[0], dt, format::get_default_format(output_shapes[0].size())});
+
+    } else {
+        ov::op::v6::CTCGreedyDecoderSeqLen op;
+
+        std::vector output_shapes = ov::op::v6::shape_infer(&op, input_shapes);
+
+        for (size_t i = 0; i < desc->num_outputs; ++i) {
+            auto dt = desc->get_output_data_type(i).value_or(impl_param.get_input_layout(i).data_type);
+            layouts.push_back({output_shapes[i], dt, format::get_default_format(output_shapes[i].size())});
+        }
+    }
+
+    return layouts;
+}
+
+template std::vector
+ctc_greedy_decoder_inst::calc_output_layouts(ctc_greedy_decoder_node const& node,
+                                             const kernel_impl_params& impl_param);
+
 std::string ctc_greedy_decoder_inst::to_string(ctc_greedy_decoder_node const& node) {
     auto node_info = node.desc_to_json();
     auto desc = node.get_primitive();
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/ctc_greedy_decoder.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/ctc_greedy_decoder.cpp
index 3d00be5740b7e8..06ae2c91984d6f 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/ctc_greedy_decoder.cpp
+++ 
b/src/plugins/intel_gpu/src/graph/impls/ocl/ctc_greedy_decoder.cpp @@ -31,12 +31,30 @@ struct ctc_greedy_decoder_impl : typed_primitive_impl_ocl { auto has_second_output = !primitive->second_output.empty(); params.inputs.push_back(convert_data_tensor(impl_param.input_layouts[1])); params.merge_repeated = primitive->ctc_merge_repeated; - params.blank_index = primitive->blank_index; - params.outputs_num = has_second_output ? 2 : 1; - if (params.outputs_num == 2) { - params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(1))); + bool allow_new_shape_infer = impl_param.get_program().get_config().get_property(ov::intel_gpu::allow_new_shape_infer); + if (allow_new_shape_infer && primitive->num_outputs == 2) { + if (primitive->blank_index == UINT32_MAX) { + params.blank_index = impl_param.get_input_layout(0).spatial(1) - 1; + } else { + params.blank_index = primitive->blank_index; + } + params.outputs_num = 2; + params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(1))); + + } else { + if (primitive->blank_index == UINT32_MAX) { + params.blank_index = impl_param.get_input_layout(0).spatial(1) - 1; + } else { + params.blank_index = primitive->blank_index; + } + params.outputs_num = has_second_output ? 2 : 1; + + if (params.outputs_num == 2) { + params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(1))); + } } + return {params, optional_params}; } }; diff --git a/src/plugins/intel_gpu/src/graph/include/ctc_greedy_decoder_inst.h b/src/plugins/intel_gpu/src/graph/include/ctc_greedy_decoder_inst.h index 899e1f3bd0fa5a..2480c24b5b24ff 100644 --- a/src/plugins/intel_gpu/src/graph/include/ctc_greedy_decoder_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/ctc_greedy_decoder_inst.h @@ -32,6 +32,8 @@ class typed_primitive_inst : public typed_primitive_inst_bas using parent::parent; public: + template + static std::vector calc_output_layouts(ctc_greedy_decoder_node const& /*node*/, const kernel_impl_params& impl_param); static layout calc_output_layout(ctc_greedy_decoder_node const& node, kernel_impl_params const& impl_param); static std::string to_string(ctc_greedy_decoder_node const& node); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/ctc_greedy_decoder_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/ctc_greedy_decoder_ref.cl index 67cdb55b455b89..b33da4d11fe489 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/ctc_greedy_decoder_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/ctc_greedy_decoder_ref.cl @@ -7,8 +7,10 @@ KERNEL(ctc_greedy_decoder_ref)(const __global INPUT0_TYPE* probabilities ,const __global INPUT1_TYPE* sequence_indicators ,__global OUTPUT_TYPE* output_sequences -#ifdef SECOND_OUTPUT_EXIST +#ifdef LEGACY_MULTIPLE_OUTPUTS ,__global INPUT2_TYPE* second_output +#elif NEW_MULTIPLE_OUTPUTS + ,__global OUTPUT1_TYPE* second_output #endif ) { @@ -23,13 +25,13 @@ KERNEL(ctc_greedy_decoder_ref)(const __global INPUT0_TYPE* probabilities for (int t = 0; t < T_; ++t) { // get maximum probability and its index -#ifdef SECOND_OUTPUT_EXIST +#if defined LEGACY_MULTIPLE_OUTPUTS || defined NEW_MULTIPLE_OUTPUTS if (t >= sequence_indicators[n]) break; #else if (sequence_indicators[t * N_ + n] == 0) break; #endif int max_class_idx = 0; -#ifdef SECOND_OUTPUT_EXIST +#if defined LEGACY_MULTIPLE_OUTPUTS || defined NEW_MULTIPLE_OUTPUTS const __global INPUT0_TYPE* probs = probabilities + n * C_ * T_ + t * C_; #else const __global INPUT0_TYPE* probs = probabilities + t * C_ * N_ + n * C_; @@ 
-51,7 +53,7 @@ KERNEL(ctc_greedy_decoder_ref)(const __global INPUT0_TYPE* probabilities prev_class_idx = max_class_idx; } -#ifdef SECOND_OUTPUT_EXIST +#if defined LEGACY_MULTIPLE_OUTPUTS || defined NEW_MULTIPLE_OUTPUTS second_output[n] = output_index - n * T_; #endif } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/ctc_greedy_decoder/ctc_greedy_decoder_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/ctc_greedy_decoder/ctc_greedy_decoder_kernel_base.cpp index 9b9e4c65c60b98..fa18afc340bc99 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/ctc_greedy_decoder/ctc_greedy_decoder_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/ctc_greedy_decoder/ctc_greedy_decoder_kernel_base.cpp @@ -18,8 +18,16 @@ JitConstants CTCGreedyDecoderKernelBase::GetJitConstants(const ctc_greedy_decode }); if (params.outputs_num == 2) { + if (params.inputs.size() == 3) { + jit.AddConstants({ + MakeJitConstant("LEGACY_MULTIPLE_OUTPUTS", 1) + }); + } else { + jit.AddConstants({ + MakeJitConstant("NEW_MULTIPLE_OUTPUTS", 1) + }); + } jit.AddConstants({ - MakeJitConstant("SECOND_OUTPUT_EXIST", 1), MakeJitConstant("N_", inp.Batch().v), MakeJitConstant("T_", inp.Feature().v) }); @@ -73,7 +81,11 @@ KernelsData CTCGreedyDecoderKernelBase::GetCommonKernelsData(const Params& param GetFusedPrimitiveInputsCount(params)); if (orgParams.outputs_num == 2) { - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 2}); + if (orgParams.inputs.size() == 3) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 2}); + } else { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::OUTPUT, 1}); + } } return {kd}; diff --git a/src/plugins/intel_gpu/src/plugin/ops/ctc_greedy_decoder.cpp b/src/plugins/intel_gpu/src/plugin/ops/ctc_greedy_decoder.cpp index 5558f7be3e61f8..1535170b64c5fe 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/ctc_greedy_decoder.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/ctc_greedy_decoder.cpp @@ -43,67 +43,113 @@ static void CreateCommonCTCGreedyDecoderOp(ProgramBuilder& p, const std::shared_ } } - uint32_t blank_index = static_cast(op->get_input_shape(0).back() - 1); - if (reordered_inputs.size() == 3) { - auto blank_index_node = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(2)); - if (!blank_index_node) { - OPENVINO_THROW("Unsupported blank_index node type in ", op->get_friendly_name(), " (", op->get_type_name(), ")"); + if (p.use_new_shape_infer()) { + size_t num_outputs = op->get_output_size(); + + auto get_output_paddings = [&]() { + std::vector output_paddings; + for (size_t i = 0; i < num_outputs; i++) + output_paddings.push_back(cldnn::padding()); + return output_paddings; + }; + + auto get_output_data_types = [&]() { + std::vector output_data_types; + for (size_t i = 0; i < num_outputs; i++) { + auto type = op->get_output_element_type(i); + output_data_types.push_back(cldnn::element_type_to_data_type(type)); + } + return output_data_types; + }; + + uint32_t blank_index = UINT32_MAX; + if (reordered_inputs.size() == 3) { + auto blank_index_node = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(2)); + if (!blank_index_node) { + OPENVINO_THROW("Unsupported blank_index node type in ", op->get_friendly_name(), " (", op->get_type_name(), ")"); + } + float val; + if (ov::shape_size(blank_index_node->get_output_shape(0)) != 1 || !ov::op::util::get_single_value(blank_index_node, val)) { + OPENVINO_THROW("Unsupported parameter size in ", op->get_friendly_name(), " (", 
op->get_type_name(), ")"); + } + blank_index = static_cast(val); + reordered_inputs.pop_back(); } - float val; - if (ov::shape_size(blank_index_node->get_output_shape(0)) != 1 || !ov::op::util::get_single_value(blank_index_node, val)) { - OPENVINO_THROW("Unsupported parameter size in ", op->get_friendly_name(), " (", op->get_type_name(), ")"); + + auto primitive = cldnn::ctc_greedy_decoder( + layer_type_name_ID(op), + reordered_inputs, + blank_index, + ctc_merge_repeated, + cldnn::padding({0, 0, 0, 0}, 0), + cldnn::element_type_to_data_type(op->get_output_element_type(0)), + op->get_output_size()); + primitive.output_paddings = get_output_paddings(); + primitive.output_data_types = get_output_data_types(); + p.add_primitive(*op, primitive); + } else { + uint32_t blank_index = static_cast(op->get_input_shape(0).back() - 1); + if (reordered_inputs.size() == 3) { + auto blank_index_node = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(2)); + if (!blank_index_node) { + OPENVINO_THROW("Unsupported blank_index node type in ", op->get_friendly_name(), " (", op->get_type_name(), ")"); + } + float val; + if (ov::shape_size(blank_index_node->get_output_shape(0)) != 1 || !ov::op::util::get_single_value(blank_index_node, val)) { + OPENVINO_THROW("Unsupported parameter size in ", op->get_friendly_name(), " (", op->get_type_name(), ")"); + } + blank_index = static_cast(val); + reordered_inputs.pop_back(); } - blank_index = static_cast(val); - reordered_inputs.pop_back(); - } - std::size_t num_output = op->get_output_size(); + std::size_t num_output = op->get_output_size(); - std::vector shared_memory; - if (num_output == 2) { - auto mutable_precision = op->get_output_element_type(1); - if (mutable_precision == ov::element::i64) { - mutable_precision = ov::element::i32; - } + std::vector shared_memory; + if (num_output == 2) { + auto mutable_precision = op->get_output_element_type(1); + if (mutable_precision == ov::element::i64) { + mutable_precision = ov::element::i32; + } - cldnn::layout mutableLayout = cldnn::layout( - cldnn::element_type_to_data_type(mutable_precision), - cldnn::format::get_default_format(op->get_output_shape(1).size()), - tensor_from_dims(op->get_output_shape(1))); + cldnn::layout mutableLayout = cldnn::layout( + cldnn::element_type_to_data_type(mutable_precision), + cldnn::format::get_default_format(op->get_output_shape(1).size()), + tensor_from_dims(op->get_output_shape(1))); - GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl; - shared_memory.emplace_back(p.get_engine().allocate_memory(mutableLayout)); + GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl; + shared_memory.emplace_back(p.get_engine().allocate_memory(mutableLayout)); - cldnn::primitive_id ctc_gd_mutable_id_w = layer_type_name_ID(op) + "_md_write"; - auto ctc_gd_mutable_prim = cldnn::mutable_data(ctc_gd_mutable_id_w, - shared_memory[0]); - p.add_primitive(*op, ctc_gd_mutable_prim); - reordered_inputs.push_back(ctc_gd_mutable_id_w); - } + cldnn::primitive_id ctc_gd_mutable_id_w = layer_type_name_ID(op) + "_md_write"; + auto ctc_gd_mutable_prim = cldnn::mutable_data(ctc_gd_mutable_id_w, + shared_memory[0]); + p.add_primitive(*op, ctc_gd_mutable_prim); + reordered_inputs.push_back(ctc_gd_mutable_id_w); + } - auto CTCGreedyDecoderLayerName = num_output == 2 ? 
layer_type_name_ID(op) + ".out0" : layer_type_name_ID(op);
-    auto primitive = cldnn::ctc_greedy_decoder(
-        CTCGreedyDecoderLayerName,
-        reordered_inputs,
-        blank_index,
-        ctc_merge_repeated,
-        tensor_from_dims(op->get_output_shape(0)));
+        auto CTCGreedyDecoderLayerName = num_output == 2 ? layer_type_name_ID(op) + ".out0" : layer_type_name_ID(op);
+        auto primitive = cldnn::ctc_greedy_decoder(
+            CTCGreedyDecoderLayerName,
+            reordered_inputs,
+            blank_index,
+            ctc_merge_repeated,
+            tensor_from_dims(op->get_output_shape(0)));
 
-    // GPU primitive supports only i32 as output data type
-    primitive.output_data_types = {cldnn::element_type_to_data_type(ov::element::i32)};
+        // GPU primitive supports only i32 as output data type
+        primitive.output_data_types = {cldnn::element_type_to_data_type(ov::element::i32)};
 
-    if (num_output == 2) {
-        primitive.second_output = reordered_inputs.back().pid;
-    }
+        if (num_output == 2) {
+            primitive.second_output = reordered_inputs.back().pid;
+        }
 
-    p.add_primitive(*op, primitive);
+        p.add_primitive(*op, primitive);
 
-    if (num_output == 2) {
-        cldnn::primitive_id ctc_gd_mutable_id_r = layer_type_name_ID(op) + ".out1";
-        auto ctc_gd_mutable_prim_r = cldnn::mutable_data(ctc_gd_mutable_id_r,
-                                                         { cldnn::input_info(CTCGreedyDecoderLayerName) },
-                                                         shared_memory[0]);
-        p.add_primitive(*op, ctc_gd_mutable_prim_r);
+        if (num_output == 2) {
+            cldnn::primitive_id ctc_gd_mutable_id_r = layer_type_name_ID(op) + ".out1";
+            auto ctc_gd_mutable_prim_r = cldnn::mutable_data(ctc_gd_mutable_id_r,
+                                                             { cldnn::input_info(CTCGreedyDecoderLayerName) },
+                                                             shared_memory[0]);
+            p.add_primitive(*op, ctc_gd_mutable_prim_r);
+        }
     }
 }
diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/ctc_greedy_decoder_seq_len.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/ctc_greedy_decoder_seq_len.cpp
index 12d318d107d342..4667ac7a1b9aca 100644
--- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/ctc_greedy_decoder_seq_len.cpp
+++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/ctc_greedy_decoder_seq_len.cpp
@@ -9,7 +9,7 @@ namespace {
 using ov::test::CTCGreedyDecoderSeqLenLayerTest;
 
-std::vector<std::vector<ov::Shape>> inputShape{{{1, 1, 1}}, {{1, 6, 10}}, {{3, 3, 16}}, {{5, 3, 55}}};
+std::vector<std::vector<ov::Shape>> inputShape{{{1, 28, 41}}, {{1, 1, 1}}, {{1, 6, 10}}, {{3, 3, 16}}, {{5, 3, 55}}};
 
 const std::vector<ov::element::Type> probPrecisions = {
     ov::element::f32,
diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/ctc_greedy_decoder.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/ctc_greedy_decoder.cpp
new file mode 100644
index 00000000000000..ba6545f31a95cf
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/ctc_greedy_decoder.cpp
@@ -0,0 +1,116 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <random>
+#include <sstream>
+#include <tuple>
+#include "common_test_utils/test_constants.hpp"
+#include "shared_test_classes/base/ov_subgraph.hpp"
+
+namespace {
+using ov::test::InputShape;
+
+typedef std::tuple<
+    ov::element::Type,   // Model type
+    InputShape,          // Input shape
+    bool,                // Merge repeated
+    std::string          // Device name
+> ctcGreedyDecoderParams;
+
+class CTCGreedyDecoderLayerGPUTest
+    : public testing::WithParamInterface<ctcGreedyDecoderParams>,
+      virtual public ov::test::SubgraphBaseTest {
+public:
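+    // Test scenario: a minimal dynamic-shape graph Parameter -> CTCGreedyDecoder -> Result,
+    // where the per-batch sequence mask is filled with random valid lengths in [1, T] so the
+    // decoder exercises its early-break on the first zero sequence indicator.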
+    static std::string getTestCaseName(const testing::TestParamInfo<ctcGreedyDecoderParams>& obj) {
+        ov::element::Type model_type;
+        InputShape input_shape;
+        std::string targetDevice;
+        bool merge_repeated;
+        std::tie(model_type, input_shape, merge_repeated, targetDevice) = obj.param;
+
+        std::ostringstream result;
+        const char separator = '_';
+
+        result << "IS=(";
+        result << ov::test::utils::partialShape2str({input_shape.first}) << "_" << "TS=(";
+        for (size_t i = 0lu; i < input_shape.second.size(); i++) {
+            result << ov::test::utils::vec2str(input_shape.second[i]) << "_";
+        }
+        result << ")_";
+        result << "netPRC=" << model_type.get_type_name() << separator;
+        result << "merge_repeated=" << std::boolalpha << merge_repeated << separator;
+        result << "trgDev=" << targetDevice;
+
+        return result.str();
+    }
+protected:
+    void SetUp() override {
+        ov::element::Type model_type;
+        InputShape input_shape;
+        bool merge_repeated;
+        std::tie(model_type, input_shape, merge_repeated, targetDevice) = GetParam();
+        inputDynamicShapes = {input_shape.first, {}};
+        for (size_t i = 0; i < input_shape.second.size(); ++i) {
+            targetStaticShapes.push_back({input_shape.second[i], {}});
+        }
+
+        auto param = std::make_shared<ov::op::v0::Parameter>(model_type, inputDynamicShapes.front());
+
+        size_t T = targetStaticShapes[0][0][0];
+        size_t B = targetStaticShapes[0][0][1];
+
+        std::mt19937 gen(1);
+        std::uniform_int_distribution<int> dist(1, T);
+
+        std::vector<float> sequence_mask_data(B * T, 0);
+        for (size_t b = 0; b < B; b++) {
+            int len = dist(gen);
+            for (int t = 0; t < len; t++) {
+                sequence_mask_data[t * B + b] = 1;
+            }
+        }
+        auto sequence_mask_node = std::make_shared<ov::op::v0::Constant>(model_type, ov::Shape{T, B}, sequence_mask_data);
+
+        auto ctc_greedy_decoder = std::make_shared<ov::op::v0::CTCGreedyDecoder>(param, sequence_mask_node, merge_repeated);
+
+        auto result = std::make_shared<ov::op::v0::Result>(ctc_greedy_decoder);
+        function = std::make_shared<ov::Model>(result, ov::ParameterVector{param}, "CTCGreedyDecoder");
+    }
+};
+
+
+TEST_P(CTCGreedyDecoderLayerGPUTest, Inference) {
+    run();
+};
+
+// Common params
+const std::vector<ov::element::Type> netPrecisions = {
+    ov::element::f32,
+    ov::element::f16
+};
+std::vector<bool> mergeRepeated{true, false};
+
+std::vector input_shapes_dynamic = {
+    {
+        {{-1, -1, -1}, {{ 50, 3, 3 }}},
+        {{-1, -1, -1}, {{ 50, 3, 7 }}},
+        {{-1, -1, -1}, {{ 50, 3, 8 }}},
+        {{-1, -1, -1}, {{ 50, 3, 16 }}},
+        {{-1, -1, -1}, {{ 50, 3, 128 }}},
+        {{-1, -1, -1}, {{ 50, 3, 49 }}},
+        {{-1, -1, -1}, {{ 50, 3, 55 }}},
+        {{-1, -1, -1}, {{ 1, 1, 16 }}}
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(smoke_CtcGreedyDecoderBasicDynamic,
+                         CTCGreedyDecoderLayerGPUTest,
+                         ::testing::Combine(::testing::ValuesIn(netPrecisions),
+                                            ::testing::ValuesIn(input_shapes_dynamic),
+                                            ::testing::ValuesIn(mergeRepeated),
+                                            ::testing::Values(ov::test::utils::DEVICE_GPU)),
+                         CTCGreedyDecoderLayerGPUTest::getTestCaseName);
+}  // namespace
diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/ctc_greedy_decoder_seq_len.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/ctc_greedy_decoder_seq_len.cpp
new file mode 100644
index 00000000000000..0c7b25766d0a63
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/ctc_greedy_decoder_seq_len.cpp
@@ -0,0 +1,190 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <random>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include "ov_models/utils/ov_helpers.hpp"
+#include "ov_models/builders.hpp"
+#include "shared_test_classes/base/ov_subgraph.hpp"
+#include "functional_test_utils/skip_tests_config.hpp"
+#include
"common_test_utils/test_constants.hpp" +#include "common_test_utils/ov_tensor_utils.hpp" + +using namespace InferenceEngine; +using namespace ov::test; + +namespace GPULayerTestsDefinitions { + +typedef std::tuple< + InputShape, // Input shape + int, // Sequence lengths + ov::element::Type, // Probabilities precision + ov::element::Type, // Indices precision + int, // Blank index + bool, // Merge repeated + std::string // Device name +> ctcGreedyDecoderSeqLenParams; + +class CTCGreedyDecoderSeqLenLayerGPUTest + : public testing::WithParamInterface, + virtual public SubgraphBaseTest { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + InputShape inputShape; + int sequenceLengths; + ov::element::Type dataPrecision, indicesPrecision; + int blankIndex; + bool mergeRepeated; + std::string targetDevice; + std::tie(inputShape, + sequenceLengths, + dataPrecision, + indicesPrecision, + blankIndex, + mergeRepeated, + targetDevice) = obj.param; + + std::ostringstream result; + + result << "IS=" << ov::test::utils::partialShape2str({inputShape.first}) << "_" << "TS=("; + for (const auto& shape : inputShape.second) { + result << ov::test::utils::vec2str(shape) << "_"; + } + result << ")_"; + result << "seqLen=" << sequenceLengths << '_'; + result << "dataPRC=" << dataPrecision.get_type_name() << '_'; + result << "idxPRC=" << indicesPrecision.get_type_name() << '_'; + result << "BlankIdx=" << blankIndex << '_'; + result << "mergeRepeated=" << std::boolalpha << mergeRepeated << '_'; + result << "trgDev=" << targetDevice; + + return result.str(); + } + +protected: + void SetUp() override { + InputShape inputShape; + int sequenceLengths; + ov::element::Type model_type, indices_type; + int blankIndex; + bool mergeRepeated; + std::tie(inputShape, + sequenceLengths, + model_type, + indices_type, + blankIndex, + mergeRepeated, + targetDevice) = GetParam(); + inputDynamicShapes = {inputShape.first, {}}; + for (size_t i = 0; i < inputShape.second.size(); ++i) { + targetStaticShapes.push_back({inputShape.second[i], {}}); + } + + ov::ParameterVector params {std::make_shared(model_type, inputDynamicShapes.front())}; + + const auto sequenceLenNode = [&] { + const size_t B = targetStaticShapes[0][0][0]; + const size_t T = targetStaticShapes[0][0][1]; + + // Cap sequence length up to T + const int seqLen = std::min(T, sequenceLengths); + + std::mt19937 gen{42}; + std::uniform_int_distribution dist(1, seqLen); + + std::vector sequenceLenData(B); + for (size_t b = 0; b < B; b++) { + const int len = dist(gen); + sequenceLenData[b] = len; + } + + return std::make_shared(indices_type, ov::Shape{B}, sequenceLenData); + }(); + + // Cap blank index up to C - 1 + int C = targetStaticShapes[0][0][2]; + blankIndex = std::min(blankIndex, C - 1); + + const auto blankIndexNode = [&] { + if (indices_type == ov::element::i32) { + const auto blankIdxDataI32 = std::vector{blankIndex}; + return std::make_shared(indices_type, ov::Shape{1}, blankIdxDataI32); + } else if (indices_type == ov::element::i64) { + const auto blankIdxDataI64 = std::vector{blankIndex}; + return std::make_shared(indices_type, ov::Shape{1}, blankIdxDataI64); + } + throw std::logic_error("Unsupported index precision"); + }(); + + auto ctcGreedyDecoderSeqLen = std::make_shared(params[0], + sequenceLenNode, + blankIndexNode, + mergeRepeated, + indices_type, + indices_type); + + ov::OutputVector results; + for (size_t i = 0; i < ctcGreedyDecoderSeqLen->get_output_size(); i++) { + 
results.push_back(std::make_shared(ctcGreedyDecoderSeqLen->output(i))); + } + function = std::make_shared(results, params, "CTCGreedyDecoderSeqLen"); + } +}; + +TEST_P(CTCGreedyDecoderSeqLenLayerGPUTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + run(); +}; + +namespace { + +std::vector inputShapeDynamic = { + { + {{-1, -1, -1}, {{1, 28, 41}}}, + {{-1, -1, -1}, {{1, 1, 1}}}, + {{-1, -1, -1}, {{1, 6, 10}}}, + {{-1, -1, -1}, {{3, 3, 16}}}, + {{-1, -1, -1}, {{5, 3, 55}}}, + } +}; + +const std::vector probPrecisions = { + ov::element::f32, + ov::element::f16 +}; +const std::vector idxPrecisions = { + ov::element::i32, + ov::element::i64 +}; + +std::vector mergeRepeated{true, false}; + +INSTANTIATE_TEST_SUITE_P(smoke_ctc_greedy_decoder_seq_len_dynamic, + CTCGreedyDecoderSeqLenLayerGPUTest, + ::testing::Combine(::testing::ValuesIn(inputShapeDynamic), + ::testing::Values(10), + ::testing::ValuesIn(probPrecisions), + ::testing::ValuesIn(idxPrecisions), + ::testing::Values(0), + ::testing::ValuesIn(mergeRepeated), + ::testing::Values(ov::test::utils::DEVICE_GPU)), + CTCGreedyDecoderSeqLenLayerGPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ctc_greedy_decoder_seq_len_bi_dynamic, + CTCGreedyDecoderSeqLenLayerGPUTest, + ::testing::Combine(::testing::ValuesIn(std::vector{ + {{-1, -1, -1}, {{2, 8, 11}}}, + {{-1, -1, -1}, {{4, 10, 55}}}}), + ::testing::ValuesIn(std::vector{5, 100}), + ::testing::ValuesIn(probPrecisions), + ::testing::ValuesIn(idxPrecisions), + ::testing::ValuesIn(std::vector{0, 5, 10}), + ::testing::ValuesIn(mergeRepeated), + ::testing::Values(ov::test::utils::DEVICE_GPU)), + CTCGreedyDecoderSeqLenLayerGPUTest::getTestCaseName); +} // namespace +} // namespace GPULayerTestsDefinitions diff --git a/src/plugins/intel_gpu/tests/unit/shape_infer/ctc_greedy_decoder_seq_len_si_test.cpp b/src/plugins/intel_gpu/tests/unit/shape_infer/ctc_greedy_decoder_seq_len_si_test.cpp new file mode 100644 index 00000000000000..045a06005e554a --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/shape_infer/ctc_greedy_decoder_seq_len_si_test.cpp @@ -0,0 +1,101 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" + +#include +#include +#include + +#include "ctc_greedy_decoder_inst.h" + +#include "program_wrapper.h" + +using namespace cldnn; +using namespace ::tests; + +namespace shape_infer_tests { + +struct ctc_greedy_decoder_seq_len_test_params { + std::vector in_layouts; + std::vector blank_index; + std::vector expected_layouts; +}; + +class ctc_greedy_decoder_seq_len_test : public testing::TestWithParam { }; + +TEST_P(ctc_greedy_decoder_seq_len_test, shape_infer) { + auto p = GetParam(); + auto& engine = get_test_engine(); + + std::vector> input_prims; + std::vector input_prim_ids; + { + auto prim_id = "input"; + auto input_layout_prim = std::make_shared(prim_id, p.in_layouts[0]); + input_prims.push_back(input_layout_prim); + input_prim_ids.push_back(input_info(prim_id)); + } + + for (size_t i = 1; i < p.in_layouts.size(); i++) { + auto prim_id = "const" + std::to_string(i); + auto prim_mem = engine.allocate_memory(p.in_layouts[i]); + if (i == 2) + set_values(prim_mem, p.blank_index); + auto const_data_prim = std::make_shared(prim_id, prim_mem); + input_prims.push_back(const_data_prim); + input_prim_ids.push_back(input_info(prim_id)); + } + + auto ctc_greedy_decoder_seq_len_prim = std::make_shared( + "output", + input_prim_ids, + p.blank_index[0], + true, + padding(), + data_types::i32, + 2); + + cldnn::program 
prog(engine);
+    auto& ctc_greedy_decoder_seq_len_node = prog.get_or_create(ctc_greedy_decoder_seq_len_prim);
+    for (auto& prim : input_prims) {
+        auto& input_layout_node = prog.get_or_create(prim);
+        program_wrapper::add_connection(prog, input_layout_node, ctc_greedy_decoder_seq_len_node);
+    }
+
+    auto res = ctc_greedy_decoder_inst::calc_output_layouts<ov::PartialShape>(ctc_greedy_decoder_seq_len_node, *ctc_greedy_decoder_seq_len_node.get_kernel_impl_params());
+
+    ASSERT_EQ(res.size(), 2);
+    for (size_t i = 0; i < p.expected_layouts.size(); i++)
+        ASSERT_EQ(res[i], p.expected_layouts[i]);
+}
+
+INSTANTIATE_TEST_SUITE_P(smoke, ctc_greedy_decoder_seq_len_test,
+    testing::ValuesIn(std::vector<ctc_greedy_decoder_seq_len_test_params>{
+        {
+            {
+                {layout{ov::PartialShape{1, 6, 10}, data_types::f32, format::bfyx}},
+                {layout{ov::PartialShape{1}, data_types::i32, format::bfyx}},
+            },
+            {-1},
+            {
+                {layout{ov::PartialShape{1, 6}, data_types::i32, format::bfyx}},
+                {layout{ov::PartialShape{1}, data_types::i32, format::bfyx}},
+            },
+        },
+        {
+            {
+                {layout{ov::PartialShape{1, 6, 10}, data_types::f32, format::bfyx}},
+                {layout{ov::PartialShape{1}, data_types::i32, format::bfyx}},
+                {layout{ov::PartialShape{1}, data_types::i32, format::bfyx}},
+            },
+            {5},
+            {
+                {layout{ov::PartialShape{1, 6}, data_types::i32, format::bfyx}},
+                {layout{ov::PartialShape{1}, data_types::i32, format::bfyx}},
+            },
+        },
+    }));
+
+}  // namespace shape_infer_tests

From 76338b3789dc4dc23f086915451a8fb047891be5 Mon Sep 17 00:00:00 2001
From: Vladimir Paramuzov
Date: Mon, 15 Jan 2024 08:53:05 +0400
Subject: [PATCH 06/13] [GPU] Fixed mem alloc size and pad propagation for kv cache opt (#22128)

---
 .../intel_gpu/src/graph/primitive_inst.cpp    | 17 ++++----
 .../subgraph_tests/dynamic/kv_cache.cpp       | 41 +++++++++++++------
 2 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
index 937179e14b03f2..73248603fe9058 100644
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -521,10 +521,9 @@ event::ptr primitive_inst::realloc_if_needed() {
     }
 
     // update layout to ensure that it respects paddings for correct allocation size
-    if (_node->is_type<kv_cache>() && !_impl_params->can_be_optimized()) {
+    if (_node_output_layout.data_padding.get_dynamic_pad_dims() != tensor(0)) {
         const auto current_buf_size = updated_layout.get_buffer_size().sizes();
-        ov::Shape current_shape(current_buf_size.begin(), current_buf_size.end());
-        updated_layout.set_partial_shape(current_shape);
+        updated_layout = layout(ov::Shape(current_buf_size.begin(), current_buf_size.end()), updated_layout.data_type, updated_layout.format);
     }
 
     bool can_reuse_buffer = _outputs[0] && updated_layout.count() <= max_output_layout_size;
@@ -535,12 +534,12 @@ event::ptr primitive_inst::realloc_if_needed() {
         return ev;
     }
 
-    auto current_shape = actual_layout.get_shape();
+    auto current_shape = updated_layout.get_shape();
     auto& sp = *get_network().get_shape_predictor();
-    auto dt_size = ov::element::Type(actual_layout.data_type).bitwidth();
+    auto dt_size = ov::element::Type(updated_layout.data_type).bitwidth();
     auto prealloc_info = sp.predict_preallocation_shape(id(), current_shape, dt_size, can_reuse_buffer);
     if (prealloc_info.first && sp.can_preallocate(ov::shape_size(prealloc_info.second) * dt_size)) {
-        auto new_layout = actual_layout;
+        auto new_layout = updated_layout;
         new_layout.set_partial_shape(prealloc_info.second);
         updated_params.output_layouts[0] = new_layout;
     }
@@ -561,7
+560,7 @@ event::ptr primitive_inst::realloc_if_needed() { } else { GPU_DEBUG_TRACE_DETAIL << id() << ": realloc output memory. " << " Current buffer_size=" << max_output_layout_size - << " Requested buffer_size=" << actual_layout.count() << std::endl; + << " Requested buffer_size=" << updated_layout.count() << std::endl; _outputs = allocate_outputs(&updated_params, need_reset_output_memory(), true); // TODO : need to handle multiple outputs max_output_layout_size = updated_params.output_layouts[0].count(); @@ -976,11 +975,15 @@ void primitive_inst::do_runtime_skip_gather() { for (int64_t i = 0; i < static_cast(idx_shape[0]); ++i) { if (idx_data[i] != i) { GPU_DEBUG_TRACE_DETAIL << "--- Cannot optimize because idx_data [" << i << "] (" << idx_data[i] << ") != " << i << std::endl; + if (_impl_params->output_layouts[0].data_padding.get_dynamic_pad_dims() != tensor(0)) + _impl_params->output_layouts[0].data_padding = padding(); set_can_be_optimized(false); return; } } } + // propagate input layout including correct paddings. + _impl_params->output_layouts[0] = _impl_params->input_layouts[0]; GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_gather] " << id() << " : can_be_optimized" << std::endl; GPU_DEBUG_TRACE_DETAIL << " - Input layout : " << _impl_params->get_input_layout(0).to_short_string() << std::endl; GPU_DEBUG_TRACE_DETAIL << " - Indices layout : " << _impl_params->get_input_layout(1).to_short_string() << std::endl; diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp index f2e69e1c298683..599f247210d44c 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp @@ -254,6 +254,7 @@ class KVCacheTests: public ::testing::Test { void test_smoke_multipleIterations_stateful(bool is_caching_test, bool fuse_cache_reorder, bool build_state_initializer, + size_t batch = 1, ov::element::Type model_element_type = ov::element::f16) { #if defined(ANDROID) GTEST_SKIP(); @@ -277,7 +278,6 @@ class KVCacheTests: public ::testing::Test { properties.insert(ov::cache_dir(cacheDirName)); } - const size_t batch = 1; const size_t n_heads = 32; const size_t n_features = 10; const size_t context_size = 20; @@ -311,14 +311,23 @@ class KVCacheTests: public ::testing::Test { auto output0 = model->get_results().at(0); auto beam_idx_shape = ov::Shape{batch}; - auto beam_idx_data = ov::Tensor(ov::element::i32, beam_idx_shape); + + auto beam_idx_data_0 = ov::Tensor(ov::element::i32, beam_idx_shape); + auto beam_idx_data_1 = ov::Tensor(ov::element::i32, beam_idx_shape); for (size_t i = 0; i < batch; i++) { - beam_idx_data.data()[i] = i; + beam_idx_data_0.data()[i] = i; + beam_idx_data_1.data()[i] = batch - i - 1; } - auto get_ref_results = [&ref_model, fuse_cache_reorder, &beam_idx_shape, &beam_idx_data](const ov::Tensor& kv_cache, - const ov::Tensor& new_token_data, - const ov::Tensor& matmul_data) { + std::vector beam_idx_data_array = { + beam_idx_data_0, + beam_idx_data_1, + }; + + auto get_ref_results = [&ref_model, fuse_cache_reorder, &beam_idx_shape](const ov::Tensor& kv_cache, + const ov::Tensor& new_token_data, + const ov::Tensor& matmul_data, + const ov::Tensor& beam_idx_data) { auto input0 = ref_model->get_parameters().at(0); auto input1 = ref_model->get_parameters().at(1); auto input2 = ref_model->get_parameters().at(2); @@ -367,9 +376,6 @@ class KVCacheTests: public ::testing::Test { 
infer_request.set_tensor(input0, new_token_input); infer_request.set_tensor(input1, matmul_input); - if (fuse_cache_reorder) { - infer_request.set_tensor(input2, beam_idx_data); - } for (size_t num_repeats = 0; num_repeats < 2; num_repeats++) { ov::Tensor ref_kv_cache; @@ -388,9 +394,13 @@ class KVCacheTests: public ::testing::Test { new_token_data.copy_to(new_token_input); matmul_data.copy_to(matmul_input); + if (fuse_cache_reorder) { + infer_request.set_tensor(input2, beam_idx_data_array[0]); + } + ref_kv_cache = ov::Tensor(element_type, kv_cache_size_initial); - auto ref_results = get_ref_results(ref_kv_cache, new_token_data, matmul_data); + auto ref_results = get_ref_results(ref_kv_cache, new_token_data, matmul_data, beam_idx_data_array[0]); ref_kv_cache = ref_results[0]; infer_request.infer(); @@ -408,7 +418,11 @@ class KVCacheTests: public ::testing::Test { ov::Shape matmul_in_size_loop = {batch, n_heads, input_tokens, context_length}; auto new_token_data = ov::test::utils::create_and_fill_tensor(element_type, new_token_size); auto matmul_data = ov::test::utils::create_and_fill_tensor(element_type, matmul_in_size_loop); - auto ref_results = get_ref_results(ref_kv_cache, new_token_data, matmul_data); + if (fuse_cache_reorder) { + infer_request.set_tensor(input2, beam_idx_data_array[i % beam_idx_data_array.size()]); + } + + auto ref_results = get_ref_results(ref_kv_cache, new_token_data, matmul_data, beam_idx_data_array[i % beam_idx_data_array.size()]); ref_kv_cache = ref_results[0]; new_token_input.set_shape(new_token_data.get_shape()); @@ -461,7 +475,10 @@ TEST_F(KVCacheTests, smoke_multipleIterations_stateful_gather_with_initializer_c } TEST_F(KVCacheTests, smoke_multipleIterations_stateful_gather_with_initializer_f32) { - this->test_smoke_multipleIterations_stateful(false, true, true, ov::element::f32); + this->test_smoke_multipleIterations_stateful(false, true, true, 1, ov::element::f32); +} +TEST_F(KVCacheTests, smoke_multipleIterations_stateful_gather_with_initializer_batch_3) { + this->test_smoke_multipleIterations_stateful(false, true, true, 3); } } // namespace From 75f87ad19b043f42049ec8df75043e07eb55caee Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Mon, 15 Jan 2024 10:50:56 +0400 Subject: [PATCH 07/13] [Snippets] Fixed access by expired ref (#22132) --- src/common/snippets/src/lowered/pass/allocate_buffers.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp index 18ef0d09b9704e..d34b442fd33051 100644 --- a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp @@ -67,18 +67,19 @@ void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const bool AllocateBuffers::run(lowered::LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::AllocateBuffers"); m_buffer_scratchpad_size = 0; - PassPipeline pipeline; + if (m_is_optimized_mode) { BufferClusters buffer_clusters; + PassPipeline pipeline; pipeline.register_pass(); pipeline.register_pass(); pipeline.register_pass(buffer_clusters); pipeline.register_pass(m_buffer_scratchpad_size, buffer_clusters); pipeline.register_pass(); + pipeline.run(linear_ir); } else { - pipeline.register_pass(m_buffer_scratchpad_size); + InitBuffersDefault(m_buffer_scratchpad_size).run(linear_ir); } - pipeline.run(linear_ir); return m_buffer_scratchpad_size > 0; } From 
a8311777d1e3bd8143e52c0b171f8763c3a98960 Mon Sep 17 00:00:00 2001 From: Tingqian Li Date: Mon, 15 Jan 2024 14:55:13 +0800 Subject: [PATCH 08/13] [CPU] Optimize SDPA's shape inference (#22037) --- .../intel_cpu/src/nodes/scaled_attn.cpp | 3 +- .../shape_inference/custom/scaled_attn.cpp | 68 ++++++++++ .../shape_inference/custom/scaled_attn.hpp | 24 ++++ .../cpu_opset/common/op/sdpa.cpp | 5 + .../custom_shape_infer/custom_shape_infer.cpp | 5 +- .../custom_shape_infer/scaled_attn.cpp | 127 ++++++++++++++++++ 6 files changed, 230 insertions(+), 2 deletions(-) create mode 100644 src/plugins/intel_cpu/src/shape_inference/custom/scaled_attn.cpp create mode 100644 src/plugins/intel_cpu/src/shape_inference/custom/scaled_attn.hpp create mode 100644 src/plugins/intel_cpu/tests/unit/shape_inference_test/custom_shape_infer/scaled_attn.cpp diff --git a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp index 848fbf8982c4ee..e56d289a20cece 100644 --- a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp +++ b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp @@ -15,6 +15,7 @@ #include "openvino/core/parallel.hpp" #include "openvino/op/scaled_dot_product_attention.hpp" #include "openvino/util/common_util.hpp" +#include "shape_inference/custom/scaled_attn.hpp" #include "shape_inference/shape_inference_internal_dyn.hpp" #include "utils/plain_tensor.hpp" @@ -638,7 +639,7 @@ struct ScaledDotProductAttention::AttentionExecutor : public ScaledDotProductAtt }; ScaledDotProductAttention::ScaledDotProductAttention(const std::shared_ptr& op, const GraphContext::CPtr context) - : Node(op, context, NgraphShapeInferFactory(op, EMPTY_PORT_MASK)), m_tmp_reorder(true) { + : Node(op, context, SDPAShapeInferFactory(op)), m_tmp_reorder(true) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { OPENVINO_THROW("CPU: " + errorMessage); diff --git a/src/plugins/intel_cpu/src/shape_inference/custom/scaled_attn.cpp b/src/plugins/intel_cpu/src/shape_inference/custom/scaled_attn.cpp new file mode 100644 index 00000000000000..ba6064d5eab007 --- /dev/null +++ b/src/plugins/intel_cpu/src/shape_inference/custom/scaled_attn.cpp @@ -0,0 +1,68 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "scaled_attn.hpp" + +#include "shape_inference/shape_inference_cpu.hpp" +#include "shape_inference/shape_inference_ngraph.hpp" +#include "transformations/cpu_opset/common/op/sdpa.hpp" +#include "utils.hpp" + +namespace ov { +namespace intel_cpu { +namespace node { + +class SDPAShapeInfer : public ShapeInferEmptyPads { +public: + SDPAShapeInfer(const ScaledDotProductAttentionWithKVCache::Config& config) : m_config(config) {} + + IShapeInfer::Result infer(const std::vector>& input_shapes, + const std::unordered_map& data_dependency) override { + const auto& query_dims = input_shapes.front().get(); + VectorDims present_kv_dims = input_shapes.back().get(); + const auto& beam_idx_dims = input_shapes.end()[-3].get(); + const auto& permute_axes = m_config.permute_axes; + + if (permute_axes.empty()) { + // [B, H, L, S] + present_kv_dims[0] = beam_idx_dims[0]; + present_kv_dims[2] += query_dims[2]; + return {{query_dims, present_kv_dims, present_kv_dims}, ShapeInferStatus::success}; + } + + // permute_axes[0,1,2,3] gives axis indices of B,H,L,S for query & present_kv + const size_t batch_index = permute_axes[0]; + const size_t length_index = permute_axes[2]; + present_kv_dims[batch_index] = beam_idx_dims[0]; + present_kv_dims[length_index] += 
query_dims[length_index]; + + auto n_dims = query_dims.size(); + VectorDims output_dims(n_dims); + for (size_t i = 0; i < n_dims; i++) { + output_dims[i] = query_dims[permute_axes[i]]; + } + return {{output_dims, present_kv_dims, present_kv_dims}, ShapeInferStatus::success}; + } + + port_mask_t get_port_mask() const override { + return EMPTY_PORT_MASK; + } + +private: + ScaledDotProductAttentionWithKVCache::Config m_config; +}; + +ShapeInferPtr SDPAShapeInferFactory::makeShapeInfer() const { + if (auto sdpa = std::dynamic_pointer_cast(m_op)) { + const auto& config = sdpa->get_config(); + if (config.output_BLHxS == false) + return std::make_shared(config); + } + // fallback to ngraph shape infer on non-perf-critical case + return std::make_shared(make_shape_inference(m_op), EMPTY_PORT_MASK); +} + +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/shape_inference/custom/scaled_attn.hpp b/src/plugins/intel_cpu/src/shape_inference/custom/scaled_attn.hpp new file mode 100644 index 00000000000000..8b8e06acb268f9 --- /dev/null +++ b/src/plugins/intel_cpu/src/shape_inference/custom/scaled_attn.hpp @@ -0,0 +1,24 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "shape_inference/shape_inference_cpu.hpp" + +#pragma once +namespace ov { +namespace intel_cpu { +namespace node { + +class SDPAShapeInferFactory : public ShapeInferFactory { +public: + SDPAShapeInferFactory(std::shared_ptr op) : m_op(op) {} + ShapeInferPtr makeShapeInfer() const override; + +private: + std::shared_ptr m_op; +}; +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/sdpa.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/sdpa.cpp index 31bce21d3579d3..0f780594934105 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/sdpa.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/sdpa.cpp @@ -28,6 +28,8 @@ void ov::intel_cpu::ScaledDotProductAttentionWithKVCache::validate_and_infer_typ auto q_ps = get_input_partial_shape(0); // [B, H, L0, S] auto past_kv_ps = get_input_partial_shape(input_num - 1); + // [present_kv_batch_size] + auto beam_idx_ps = get_input_partial_shape(input_num - 3); auto output_logits = q_ps; NODE_VALIDATION_CHECK(this, m_config.output_BLHxS == false); @@ -35,6 +37,7 @@ void ov::intel_cpu::ScaledDotProductAttentionWithKVCache::validate_and_infer_typ // permute_axes from original to [B, H, L, S] const auto& permute_axes = this->m_config.permute_axes; if (past_kv_ps.rank().is_static()) { + const size_t batch_index = permute_axes.empty() ? 0 : permute_axes[0]; const size_t length_index = permute_axes.empty() ? q_ps.size() - 2 : permute_axes[permute_axes.size() - 2]; const size_t head_num_index = permute_axes.empty() ? 
q_ps.size() - 3 : permute_axes[permute_axes.size() - 3]; NODE_VALIDATION_CHECK(this, q_ps.size() == past_kv_ps.size()); @@ -50,6 +53,8 @@ void ov::intel_cpu::ScaledDotProductAttentionWithKVCache::validate_and_infer_typ continue; } } + // batch_size can be dynamically changed by gather logic + past_kv_ps[batch_index] = beam_idx_ps[0]; past_kv_ps[length_index] += q_ps[length_index]; } if (!permute_axes.empty()) { diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/custom_shape_infer/custom_shape_infer.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/custom_shape_infer/custom_shape_infer.cpp index 038e3185235950..b0bf4c384e5693 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/custom_shape_infer/custom_shape_infer.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/custom_shape_infer/custom_shape_infer.cpp @@ -1,9 +1,10 @@ // Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +#include "custom_shape_infer.hpp" + #include -#include "custom_shape_infer.hpp" #include "openvino/cc/factory.h" #include "openvino/core/partial_shape.hpp" #include "openvino/core/type.hpp" @@ -20,6 +21,7 @@ #include "shape_inference/custom/priorbox.hpp" #include "shape_inference/custom/priorbox_clustered.hpp" #include "shape_inference/custom/reshape.hpp" +#include "shape_inference/custom/scaled_attn.hpp" #include "shape_inference/custom/shapeof.hpp" #include "shape_inference/custom/strided_slice.hpp" #include "shape_inference/custom/transpose.hpp" @@ -59,6 +61,7 @@ class CustomShapeInferFF : public openvino::cc::Factory + +#include "common_test_utils/test_assertions.hpp" +#include "custom_shape_infer.hpp" +#include "openvino/op/ops.hpp" +#include "transformations/cpu_opset/common/op/sdpa.hpp" + +namespace ov { +namespace intel_cpu { +namespace unit_test { +namespace cpu_shape_infer { +using namespace ov; +using namespace ov::intel_cpu; +using namespace testing; + +using SDPATestParams = std::tuple, // permute_axes + unit_test::ShapeVector // Expected output shapes + >; + +class SDPACpuShapeInferenceTest + : public unit_test::OpCpuShapeInferenceTest, + public WithParamInterface { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + unit_test::ShapeVector tmp_input_shapes; + std::vector tmp_permute_axes; + unit_test::ShapeVector tmp_exp_shape; + std::tie(tmp_input_shapes, tmp_permute_axes, tmp_exp_shape) = obj.param; + std::ostringstream result; + result << "IS" << ov::test::utils::vec2str(tmp_input_shapes) << "_"; + result << "permute_axes" << ov::test::utils::vec2str(tmp_permute_axes) << "_"; + result << "exp_shape" << ov::test::utils::vec2str(tmp_exp_shape); + return result.str(); + } + +protected: + void SetUp() override { + std::tie(input_shapes, permute_axes, output_shapes) = GetParam(); + + args.clear(); + for (const auto& ishape : input_shapes) { + args.push_back(std::make_shared(element::f32, ishape.get_shape())); + } + } + OutputVector args; + std::vector permute_axes; +}; + +TEST_P(SDPACpuShapeInferenceTest, shape_inference) { + ov::intel_cpu::ScaledDotProductAttentionWithKVCache::Config config; + config.permute_axes = permute_axes; + const auto op = make_op(args, config); + unit_test::cpu_test_shape_infer(op.get(), input_shapes, output_shapes); +} + +INSTANTIATE_TEST_SUITE_P(CpuShapeInfer, + SDPACpuShapeInferenceTest, + Values( + // llama + make_tuple(unit_test::ShapeVector{{1, 32, 14, 128}, + {1, 32, 14, 128}, + {1, 32, 14, 128}, + {1, 1, 14, 14}, + {1}, + {1, 32, 0, 128}, + {1, 32, 0, 
128}}, + std::vector{}, + unit_test::ShapeVector{{1, 32, 14, 128}, {1, 32, 14, 128}, {1, 32, 14, 128}}), + make_tuple(unit_test::ShapeVector{{1, 32, 1, 128}, + {1, 32, 1, 128}, + {1, 32, 1, 128}, + {1, 1, 1, 16}, + {1}, + {1, 32, 15, 128}, + {1, 32, 15, 128}}, + std::vector{}, + unit_test::ShapeVector{{1, 32, 1, 128}, {1, 32, 16, 128}, {1, 32, 16, 128}}), + // chatglm + make_tuple(unit_test::ShapeVector{{1, 1, 32, 128}, + {1, 1, 2, 128}, + {1, 1, 2, 128}, + {1, 1, 1, 8}, + {1}, + {7, 1, 2, 128}, + {7, 1, 2, 128}}, + std::vector{1, 2, 0, 3}, + unit_test::ShapeVector{{1, 32, 1, 128}, {8, 1, 2, 128}, {8, 1, 2, 128}}), + make_tuple(unit_test::ShapeVector{{7, 1, 32, 128}, + {7, 1, 2, 128}, + {7, 1, 2, 128}, + {1, 1, 7, 7}, + {1}, + {0, 1, 2, 128}, + {0, 1, 2, 128}}, + std::vector{1, 2, 0, 3}, + unit_test::ShapeVector{{1, 32, 7, 128}, {7, 1, 2, 128}, {7, 1, 2, 128}}), + // qwen + make_tuple(unit_test::ShapeVector{{1, 1, 32, 128}, + {1, 1, 32, 128}, + {1, 1, 32, 128}, + {1, 1, 1, 5}, + {1}, + {1, 4, 32, 128}, + {1, 4, 32, 128}}, + std::vector{0, 2, 1, 3}, + unit_test::ShapeVector{{1, 32, 1, 128}, {1, 5, 32, 128}, {1, 5, 32, 128}}), + + make_tuple(unit_test::ShapeVector{{1, 4, 32, 128}, + {1, 4, 32, 128}, + {1, 4, 32, 128}, + {1, 1, 4, 4}, + {1}, + {1, 0, 32, 128}, + {1, 0, 32, 128}}, + std::vector{0, 2, 1, 3}, + unit_test::ShapeVector{{1, 32, 4, 128}, {1, 4, 32, 128}, {1, 4, 32, 128}})), + SDPACpuShapeInferenceTest::getTestCaseName); + +} // namespace cpu_shape_infer +} // namespace unit_test +} // namespace intel_cpu +} // namespace ov From 3bf6f11dfdad37a4aea6e9126d685e27c8581885 Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Mon, 15 Jan 2024 14:59:45 +0800 Subject: [PATCH 09/13] [Core][CPU] Upgrade ie::extension to ov::extension (#21915) --- .../plugins/create_plugins_hpp.cmake | 4 +- cmake/developer_package/plugins/plugins.cmake | 2 +- .../developer_package/plugins/plugins.hpp.in | 16 +- src/core/include/openvino/core/extension.hpp | 16 +- .../interface/ie_iplugin_internal.hpp | 25 +- src/inference/src/dev/core_impl.cpp | 4 +- src/inference/src/dev/core_impl.hpp | 11 +- src/plugins/intel_cpu/src/compiled_model.cpp | 6 +- src/plugins/intel_cpu/src/compiled_model.h | 3 - src/plugins/intel_cpu/src/extension.cpp | 299 +++++++----------- src/plugins/intel_cpu/src/extension.h | 22 -- src/plugins/intel_cpu/src/extension_mngr.cpp | 41 --- src/plugins/intel_cpu/src/extension_mngr.h | 28 -- src/plugins/intel_cpu/src/graph_context.h | 8 - src/plugins/intel_cpu/src/node.cpp | 5 +- src/plugins/intel_cpu/src/node.h | 5 - src/plugins/intel_cpu/src/nodes/if.h | 1 - .../intel_cpu/src/nodes/tensoriterator.h | 1 - src/plugins/intel_cpu/src/plugin.cpp | 15 +- src/plugins/intel_cpu/src/plugin.h | 5 - src/plugins/intel_cpu/src/serialize.cpp | 21 +- src/plugins/intel_cpu/src/serialize.h | 4 +- .../src/utils/ngraph_transformation.hpp | 4 +- .../tests/unit/graph/memory_state.cpp | 4 +- .../graph/merge_transpose_reorder_test.cpp | 2 +- .../graph/resolve_edge_conflicts_test.cpp | 2 +- .../tests/unit/nodes/reorder_node_test.cpp | 1 - 27 files changed, 167 insertions(+), 388 deletions(-) delete mode 100644 src/plugins/intel_cpu/src/extension.h delete mode 100644 src/plugins/intel_cpu/src/extension_mngr.cpp delete mode 100644 src/plugins/intel_cpu/src/extension_mngr.h diff --git a/cmake/developer_package/plugins/create_plugins_hpp.cmake b/cmake/developer_package/plugins/create_plugins_hpp.cmake index 1fedf858ce58ca..2c90da6cc9d97c 100644 --- a/cmake/developer_package/plugins/create_plugins_hpp.cmake +++ 
b/cmake/developer_package/plugins/create_plugins_hpp.cmake @@ -42,10 +42,10 @@ foreach(dev_map IN LISTS OV_DEVICE_MAPPING) # declarations set(OV_PLUGINS_DECLARATIONS "${OV_PLUGINS_DECLARATIONS} - IE_DEFINE_PLUGIN_CREATE_FUNCTION_DECLARATION(${_OV_CREATE_PLUGIN_FUNC});") + OV_DEFINE_PLUGIN_CREATE_FUNCTION_DECLARATION(${_OV_CREATE_PLUGIN_FUNC});") if(${actual_dev_name}_AS_EXTENSION) set(OV_PLUGINS_DECLARATIONS "${OV_PLUGINS_DECLARATIONS} - IE_DEFINE_EXTENSION_CREATE_FUNCTION_DECLARATION(${_OV_CREATE_EXTENSION_FUNC});") + OV_DEFINE_EXTENSION_CREATE_FUNCTION_DECLARATION(${_OV_CREATE_EXTENSION_FUNC});") else() set(_OV_CREATE_EXTENSION_FUNC "nullptr") endif() diff --git a/cmake/developer_package/plugins/plugins.cmake b/cmake/developer_package/plugins/plugins.cmake index 16a9e935a896c8..a8ee3e47d25497 100644 --- a/cmake/developer_package/plugins/plugins.cmake +++ b/cmake/developer_package/plugins/plugins.cmake @@ -80,7 +80,7 @@ function(ov_add_plugin) if(OV_PLUGIN_AS_EXTENSION) # to distinguish functions creating extensions objects target_compile_definitions(${OV_PLUGIN_NAME} PRIVATE - IE_CREATE_EXTENSION=CreateExtensionShared${OV_PLUGIN_DEVICE_NAME}) + OV_CREATE_EXTENSION=CreateExtensionShared${OV_PLUGIN_DEVICE_NAME}) endif() endif() diff --git a/cmake/developer_package/plugins/plugins.hpp.in b/cmake/developer_package/plugins/plugins.hpp.in index 224f77c8cb980b..2af0666e7b84a4 100644 --- a/cmake/developer_package/plugins/plugins.hpp.in +++ b/cmake/developer_package/plugins/plugins.hpp.in @@ -9,13 +9,23 @@ #ifdef OPENVINO_STATIC_LIBRARY -#include "cpp_interfaces/interface/ie_iplugin_internal.hpp" +// The Macro used to create extensions for static library +#define OV_DEFINE_EXTENSION_CREATE_FUNCTION_DECLARATION(_OV_CREATE_EXTENSION_FUNC) \ + OPENVINO_EXTENSION_C_API void \ + _OV_CREATE_EXTENSION_FUNC(std::vector<::ov::Extension::Ptr>& ext) + +// The Macro used to create plugin for static library +#define OV_DEFINE_PLUGIN_CREATE_FUNCTION_DECLARATION(_OV_CREATE_PLUGIN_FUNC) \ + OPENVINO_PLUGIN_API void \ + _OV_CREATE_PLUGIN_FUNC(::std::shared_ptr<::ov::IPlugin> &plugin) noexcept(false) @OV_PLUGINS_DECLARATIONS@ +using CreateExtensionFunc = void(std::vector<::ov::Extension::Ptr>&); +using CreatePluginEngineFunc = void(std::shared_ptr<::ov::IPlugin>&); struct Value { - InferenceEngine::CreatePluginEngineFunc * m_create_plugin_func; - InferenceEngine::CreateExtensionFunc * m_create_extension_func; + CreatePluginEngineFunc * m_create_plugin_func; + CreateExtensionFunc * m_create_extension_func; std::map m_default_config; }; diff --git a/src/core/include/openvino/core/extension.hpp b/src/core/include/openvino/core/extension.hpp index 7e02703e6281a2..a403675ad2e522 100644 --- a/src/core/include/openvino/core/extension.hpp +++ b/src/core/include/openvino/core/extension.hpp @@ -28,24 +28,28 @@ class OPENVINO_API Extension { virtual ~Extension(); }; +} // namespace ov +#ifndef OV_CREATE_EXTENSION /** * @brief The entry point for library with OpenVINO extensions * * @param vector of extensions */ OPENVINO_EXTENSION_C_API -void create_extensions(std::vector&); +void create_extensions(std::vector&); -} // namespace ov +# define OV_CREATE_EXTENSION create_extensions + +#endif /** * @brief Macro generates the entry point for the library * * @param vector of extensions */ -#define OPENVINO_CREATE_EXTENSIONS(extensions) \ - OPENVINO_EXTENSION_C_API \ - void ::ov::create_extensions(std::vector<::ov::Extension::Ptr>& ext) { \ - ext = extensions; \ +#define OPENVINO_CREATE_EXTENSIONS(extensions) \ + 
OPENVINO_EXTENSION_C_API void OV_CREATE_EXTENSION(std::vector& ext); \ + OPENVINO_EXTENSION_C_API void OV_CREATE_EXTENSION(std::vector& ext) { \ + ext = extensions; \ } diff --git a/src/inference/dev_api/cpp_interfaces/interface/ie_iplugin_internal.hpp b/src/inference/dev_api/cpp_interfaces/interface/ie_iplugin_internal.hpp index 859e56df154f05..eb0e8d38c46f87 100644 --- a/src/inference/dev_api/cpp_interfaces/interface/ie_iplugin_internal.hpp +++ b/src/inference/dev_api/cpp_interfaces/interface/ie_iplugin_internal.hpp @@ -19,6 +19,7 @@ #include "ie_iextension.h" #include "ie_input_info.hpp" #include "ie_parameter.hpp" +#include "openvino/core/extension.hpp" #include "openvino/runtime/iplugin.hpp" #include "openvino/util/pp.hpp" #include "so_ptr.hpp" @@ -377,16 +378,6 @@ class INFERENCE_ENGINE_1_0_DEPRECATED INFERENCE_ENGINE_API_CLASS(IInferencePlugi bool _isNewAPI; //!< A flag which shows used API }; -/** - * @private - */ -using CreatePluginEngineFunc = void(std::shared_ptr<::ov::IPlugin>&); - -/** - * @private - */ -using CreateExtensionFunc = void(std::shared_ptr&); - /** * @def IE_CREATE_PLUGIN * @brief Defines a name of a function creating plugin instance @@ -428,17 +419,3 @@ convert_plugin(const std::shared_ptr& from); ie_plugin->SetVersion(version); \ plugin = convert_plugin(ie_plugin); \ } - -/** - * @private - */ -#define IE_DEFINE_PLUGIN_CREATE_FUNCTION_DECLARATION(_IE_CREATE_PLUGIN_FUNC) \ - INFERENCE_PLUGIN_API(void) \ - _IE_CREATE_PLUGIN_FUNC(::std::shared_ptr<::ov::IPlugin>& plugin) noexcept(false) - -/** - * @private - */ -#define IE_DEFINE_EXTENSION_CREATE_FUNCTION_DECLARATION(_IE_CREATE_EXTENSION_FUNC) \ - INFERENCE_EXTENSION_API(void) \ - _IE_CREATE_EXTENSION_FUNC(::InferenceEngine::IExtensionPtr& ext) diff --git a/src/inference/src/dev/core_impl.cpp b/src/inference/src/dev/core_impl.cpp index ced075a2d89f76..6489f7e4af8b32 100644 --- a/src/inference/src/dev/core_impl.cpp +++ b/src/inference/src/dev/core_impl.cpp @@ -703,9 +703,9 @@ ov::Plugin ov::CoreImpl::get_plugin(const std::string& pluginName) const { if (desc.extensionCreateFunc) { // static OpenVINO case try { - InferenceEngine::IExtensionPtr ext; + std::vector ext; desc.extensionCreateFunc(ext); - AddExtensionUnsafe(ext); + add_extensions_unsafe(ext); } catch (const InferenceEngine::GeneralError&) { // the same extension can be registered multiple times - ignore it! 
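                // (the static create function now fills a std::vector<ov::Extension::Ptr>
                // and hands it to add_extensions_unsafe, so duplicates surface the same way)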
} diff --git a/src/inference/src/dev/core_impl.hpp b/src/inference/src/dev/core_impl.hpp index 86365f891fcc64..2a4415ad941bd4 100644 --- a/src/inference/src/dev/core_impl.hpp +++ b/src/inference/src/dev/core_impl.hpp @@ -26,6 +26,9 @@ namespace ov { +using CreateExtensionFunc = void(std::vector<::ov::Extension::Ptr>&); +using CreatePluginEngineFunc = void(std::shared_ptr<::ov::IPlugin>&); + const std::string DEFAULT_DEVICE_NAME = "DEFAULT_DEVICE"; struct Parsed { @@ -123,8 +126,8 @@ class CoreImpl : public InferenceEngine::ICore, public std::enable_shared_from_t ov::util::FilePath libraryLocation; ov::AnyMap defaultConfig; std::vector listOfExtentions; - InferenceEngine::CreatePluginEngineFunc* pluginCreateFunc = nullptr; - InferenceEngine::CreateExtensionFunc* extensionCreateFunc = nullptr; + CreatePluginEngineFunc* pluginCreateFunc = nullptr; + CreateExtensionFunc* extensionCreateFunc = nullptr; PluginDescriptor() = default; @@ -136,9 +139,9 @@ class CoreImpl : public InferenceEngine::ICore, public std::enable_shared_from_t this->listOfExtentions = listOfExtentions; } - PluginDescriptor(InferenceEngine::CreatePluginEngineFunc* pluginCreateFunc, + PluginDescriptor(CreatePluginEngineFunc* pluginCreateFunc, const ov::AnyMap& defaultConfig = {}, - InferenceEngine::CreateExtensionFunc* extensionCreateFunc = nullptr) { + CreateExtensionFunc* extensionCreateFunc = nullptr) { this->pluginCreateFunc = pluginCreateFunc; this->defaultConfig = defaultConfig; this->extensionCreateFunc = extensionCreateFunc; diff --git a/src/plugins/intel_cpu/src/compiled_model.cpp b/src/plugins/intel_cpu/src/compiled_model.cpp index f45f1e2a2b70b3..f81f59f94ae418 100644 --- a/src/plugins/intel_cpu/src/compiled_model.cpp +++ b/src/plugins/intel_cpu/src/compiled_model.cpp @@ -40,13 +40,11 @@ struct ImmediateSerialExecutor : public ov::threading::ITaskExecutor { CompiledModel::CompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, const Config& cfg, - const ExtensionManager::Ptr& extMgr, const bool loaded_from_cache) : ov::ICompiledModel::ICompiledModel(model, plugin), m_model(model), m_plugin(plugin), m_cfg{cfg}, - extensionManager(extMgr), m_name{model->get_name()}, m_loaded_from_cache(loaded_from_cache) { bool isFloatModel = !ov::op::util::has_op_with_type(m_model); @@ -125,7 +123,7 @@ CompiledModel::GraphGuard::Lock CompiledModel::get_graph() const { (m_cfg.lpTransformsMode == Config::On) && ov::pass::low_precision::LowPrecision::isFunctionQuantized(m_model); - ctx = std::make_shared(m_cfg, extensionManager, weightsCache, isQuantizedFlag); + ctx = std::make_shared(m_cfg, weightsCache, isQuantizedFlag); } const std::shared_ptr model = m_model; graphLock._graph.CreateGraph(model, ctx); @@ -306,7 +304,7 @@ ov::Any CompiledModel::get_property(const std::string& name) const { } void CompiledModel::export_model(std::ostream& modelStream) const { - ModelSerializer serializer(modelStream, extensionManager); + ModelSerializer serializer(modelStream); serializer << m_model; } diff --git a/src/plugins/intel_cpu/src/compiled_model.h b/src/plugins/intel_cpu/src/compiled_model.h index 0561d97e556952..d11ece0e8c2aea 100644 --- a/src/plugins/intel_cpu/src/compiled_model.h +++ b/src/plugins/intel_cpu/src/compiled_model.h @@ -7,7 +7,6 @@ #include #include -#include "extension_mngr.h" #include "graph.h" #include "graph_context.h" #include "openvino/runtime/icompiled_model.hpp" @@ -26,7 +25,6 @@ class CompiledModel : public ov::ICompiledModel { CompiledModel(const std::shared_ptr& model, const std::shared_ptr& 
plugin, const Config& cfg, - const ExtensionManager::Ptr& extMgr, const bool loaded_from_cache = false); std::shared_ptr create_infer_request() const override; @@ -55,7 +53,6 @@ class CompiledModel : public ov::ICompiledModel { // Usage example: helps to avoid data races during CPU Graph initialization in multi-streams scenario std::shared_ptr m_mutex; Config m_cfg; - ExtensionManager::Ptr extensionManager; mutable std::atomic_int m_numRequests = {0}; std::string m_name; struct GraphGuard : public Graph { diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index 9cda1c4fa26175..41d91bfc382681 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -2,204 +2,137 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "extension.h" +#include "openvino/core/extension.hpp" + +#include "openvino/core/op_extension.hpp" +#include "ov_ops/augru_cell.hpp" +#include "ov_ops/augru_sequence.hpp" +#include "ov_ops/multiclass_nms_ie_internal.hpp" +#include "ov_ops/nms_ie_internal.hpp" +#include "ov_ops/nms_static_shape_ie.hpp" +#include "ov_ops/type_relaxed.hpp" +#include "snippets/op/subgraph.hpp" #include "transformations/cpu_opset/common/op/fully_connected.hpp" #include "transformations/cpu_opset/common/op/leaky_relu.hpp" +#include "transformations/cpu_opset/common/op/ngram.hpp" #include "transformations/cpu_opset/common/op/power_static.hpp" #include "transformations/cpu_opset/common/op/sdpa.hpp" #include "transformations/cpu_opset/common/op/swish_cpu.hpp" -#include "transformations/cpu_opset/common/op/ngram.hpp" -#include "transformations/cpu_opset/x64/op/mha.hpp" #include "transformations/cpu_opset/x64/op/interaction.hpp" -#include "transformations/snippets/x64/op/load_convert.hpp" -#include "transformations/snippets/x64/op/store_convert.hpp" -#include "transformations/snippets/x64/op/brgemm_cpu.hpp" +#include "transformations/cpu_opset/x64/op/mha.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" +#include "transformations/snippets/x64/op/load_convert.hpp" #include "transformations/snippets/x64/op/perf_count_rdtsc.hpp" +#include "transformations/snippets/x64/op/store_convert.hpp" -#include -#include -#include -#include -#include -#include - -#include "snippets/op/subgraph.hpp" - -#include - -namespace ov { -namespace intel_cpu { - -void Extension::GetVersion(const InferenceEngine::Version*& versionInfo) const noexcept { - static const InferenceEngine::Version version = { - {1, 0}, // extension API version - "1.0", - "Extension" // extension description message - }; - - versionInfo = &version; -} - -void Extension::Unload() noexcept {} - -std::map Extension::getOpSets() { - auto cpu_plugin_opset = []() { - ngraph::OpSet opset; +#define OP_EXTENSION(NAME) std::make_shared>(), #if defined(OPENVINO_ARCH_X86_64) -#define NGRAPH_OP_X64(NAME, NAMESPACE) NGRAPH_OP(NAME, NAMESPACE) +# define OP_EXTENSION_X64(NAME) OP_EXTENSION(NAME) #else -#define NGRAPH_OP_X64(NAME, NAMESPACE) +# define OP_EXTENSION_X64(NAME) #endif -#define NGRAPH_OP(NAME, NAMESPACE) opset.insert(); - NGRAPH_OP(FullyConnectedNode, ov::intel_cpu) - NGRAPH_OP(LeakyReluNode, ov::intel_cpu) - NGRAPH_OP(PowerStaticNode, ov::intel_cpu) - NGRAPH_OP(SwishNode, ov::intel_cpu) - NGRAPH_OP(NgramNode, ov::intel_cpu) - NGRAPH_OP_X64(MHANode, ov::intel_cpu) - NGRAPH_OP_X64(InteractionNode, ov::intel_cpu) - NGRAPH_OP_X64(ScaledDotProductAttentionWithKVCache, ov::intel_cpu) -#undef NGRAPH_OP - 
- return opset; - }; - - auto type_relaxed_opset = []() { - ngraph::OpSet opset; - -#define NGRAPH_OP(NAME, NAMESPACE) opset.insert>(); - NGRAPH_OP(Add, ov::op::v1) - NGRAPH_OP(AvgPool, ov::op::v1) - NGRAPH_OP(Clamp, ov::op::v0) - NGRAPH_OP(Concat, ov::op::v0) - NGRAPH_OP(Convolution, ov::op::v1) - NGRAPH_OP(ConvolutionBackpropData, ov::op::v1) - NGRAPH_OP(DepthToSpace, ov::op::v0) - NGRAPH_OP(Equal, ov::op::v1) - NGRAPH_OP(FakeQuantize, ov::op::v0) - NGRAPH_OP(Greater, ov::op::v1) - NGRAPH_OP(GreaterEqual, ov::op::v1) - NGRAPH_OP(GroupConvolution, ov::op::v1) - NGRAPH_OP(GroupConvolutionBackpropData, ov::op::v1) - NGRAPH_OP(Interpolate, ov::op::v0) - NGRAPH_OP(Interpolate, ov::op::v4) - NGRAPH_OP(Less, ov::op::v1) - NGRAPH_OP(LessEqual, ov::op::v1) - NGRAPH_OP(LogicalAnd, ov::op::v1) - NGRAPH_OP(LogicalNot, ov::op::v1) - NGRAPH_OP(LogicalOr, ov::op::v1) - NGRAPH_OP(LogicalXor, ov::op::v1) - NGRAPH_OP(MatMul, ov::op::v0) - NGRAPH_OP(MaxPool, ov::op::v1) - NGRAPH_OP(Multiply, ov::op::v1) - NGRAPH_OP(NormalizeL2, ov::op::v0) - NGRAPH_OP(NotEqual, ov::op::v1) - NGRAPH_OP(PRelu, ov::op::v0) - NGRAPH_OP(Relu, ov::op::v0) - NGRAPH_OP(ReduceMax, ov::op::v1) - NGRAPH_OP(ReduceLogicalAnd, ov::op::v1) - NGRAPH_OP(ReduceLogicalOr, ov::op::v1) - NGRAPH_OP(ReduceMean, ov::op::v1) - NGRAPH_OP(ReduceMin, ov::op::v1) - NGRAPH_OP(ReduceSum, ov::op::v1) - NGRAPH_OP(Reshape, ov::op::v1) - NGRAPH_OP(Select, ov::op::v1) - NGRAPH_OP(ShapeOf, ov::op::v0) - NGRAPH_OP(ShuffleChannels, ov::op::v0) - NGRAPH_OP(Squeeze, ov::op::v0) - NGRAPH_OP(Subtract, ov::op::v1) - NGRAPH_OP(Unsqueeze, ov::op::v0) - NGRAPH_OP(MVN, ov::op::v0) - NGRAPH_OP(MVN, ov::op::v6) - NGRAPH_OP(Select, ov::op::v1) - NGRAPH_OP(ConvolutionBackpropData, ov::op::v1) -#undef NGRAPH_OP - - return opset; - }; - - auto ie_internal_opset = []() { - ngraph::OpSet opset; - -#define NGRAPH_OP(NAME, NAMESPACE) opset.insert(); - NGRAPH_OP(NonMaxSuppressionIEInternal, ov::op::internal) - NGRAPH_OP(MulticlassNmsIEInternal, ov::op::internal) - NGRAPH_OP(AUGRUCell, ov::op::internal) - NGRAPH_OP(AUGRUSequence, ov::op::internal) - NGRAPH_OP(NmsStaticShapeIE, ov::op::internal) -#undef NGRAPH_OP - - return opset; - }; - - auto snippets_opset = []() { - ngraph::OpSet opset; +#define CPU_EXTENSIONS \ + OP_EXTENSION(ov::intel_cpu::FullyConnectedNode) \ + OP_EXTENSION(ov::intel_cpu::LeakyReluNode) \ + OP_EXTENSION(ov::intel_cpu::PowerStaticNode) \ + OP_EXTENSION(ov::intel_cpu::SwishNode) \ + OP_EXTENSION(ov::intel_cpu::NgramNode) \ + OP_EXTENSION(ov::op::internal::NonMaxSuppressionIEInternal) \ + OP_EXTENSION(ov::op::internal::MulticlassNmsIEInternal) \ + OP_EXTENSION(ov::op::internal::AUGRUCell) \ + OP_EXTENSION(ov::op::internal::AUGRUSequence) \ + OP_EXTENSION(ov::op::internal::NmsStaticShapeIE) \ + OP_EXTENSION_X64(ov::intel_cpu::MHANode) \ + OP_EXTENSION_X64(ov::intel_cpu::InteractionNode) \ + OP_EXTENSION_X64(ov::intel_cpu::ScaledDotProductAttentionWithKVCache) \ + OP_EXTENSION_X64(ov::intel_cpu::LoadConvertSaturation) \ + OP_EXTENSION_X64(ov::intel_cpu::LoadConvertTruncation) \ + OP_EXTENSION_X64(ov::intel_cpu::StoreConvertSaturation) \ + OP_EXTENSION_X64(ov::intel_cpu::StoreConvertTruncation) \ + OP_EXTENSION_X64(ov::intel_cpu::BrgemmCPU) \ + OP_EXTENSION_X64(ov::intel_cpu::BrgemmCopyB) + +#define TYPE_RELAXED_EXTENSIONS \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + 
OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) \ + OP_EXTENSION(ov::op::TypeRelaxed) -#define NGRAPH_OP(NAME, NAMESPACE) opset.insert(); - NGRAPH_OP(Brgemm, ov::snippets::op) - NGRAPH_OP(BroadcastLoad, ov::snippets::op) - NGRAPH_OP(BroadcastMove, ov::snippets::op) - NGRAPH_OP(ConvertSaturation, ov::snippets::op) - NGRAPH_OP(ConvertTruncation, ov::snippets::op) - NGRAPH_OP(Fill, ov::snippets::op) - NGRAPH_OP(HorizonMax, ov::snippets::op) - NGRAPH_OP(HorizonSum, ov::snippets::op) - NGRAPH_OP(Kernel, ov::snippets::op) - NGRAPH_OP(IntermediateMemoryBuffer, ov::snippets::op) - NGRAPH_OP(Load, ov::snippets::op) - NGRAPH_OP(LoadReshape, ov::snippets::op) - NGRAPH_OP(LoopBegin, ov::snippets::op) - NGRAPH_OP(LoopEnd, ov::snippets::op) - NGRAPH_OP(NewMemoryBuffer, ov::snippets::op) - NGRAPH_OP(Nop, ov::snippets::op) - NGRAPH_OP(PowerStatic, ov::snippets::op) - NGRAPH_OP(Scalar, ov::snippets::op) - NGRAPH_OP(Store, ov::snippets::op) - NGRAPH_OP(Subgraph, ov::snippets::op) - NGRAPH_OP(VectorBuffer, ov::snippets::op) - NGRAPH_OP(RankNormalization, ov::snippets::op) #ifdef SNIPPETS_DEBUG_CAPS - NGRAPH_OP(PerfCountBegin, ov::snippets::op) - NGRAPH_OP(PerfCountEnd, ov::snippets::op) -#endif - NGRAPH_OP_X64(LoadConvertSaturation, ov::intel_cpu) - NGRAPH_OP_X64(LoadConvertTruncation, ov::intel_cpu) - NGRAPH_OP_X64(StoreConvertSaturation, ov::intel_cpu) - NGRAPH_OP_X64(StoreConvertTruncation, ov::intel_cpu) - NGRAPH_OP_X64(BrgemmCPU, ov::intel_cpu) - NGRAPH_OP_X64(BrgemmCopyB, ov::intel_cpu) -#ifdef SNIPPETS_DEBUG_CAPS - NGRAPH_OP_X64(PerfCountRdtscBegin, ov::intel_cpu) - NGRAPH_OP_X64(PerfCountRdtscEnd, ov::intel_cpu) +# define SNIPPETS_DEBUG_CAPS_EXTENSIONS \ + OP_EXTENSION(ov::snippets::op::PerfCountBegin) \ + OP_EXTENSION(ov::snippets::op::PerfCountEnd) \ + OP_EXTENSION_X64(ov::intel_cpu::PerfCountRdtscBegin) \ + OP_EXTENSION_X64(ov::intel_cpu::PerfCountRdtscEnd) +#else +# define SNIPPETS_DEBUG_CAPS_EXTENSIONS #endif -#undef NGRAPH_OP - - return opset; - }; - - static std::map opsets = { - { "cpu_plugin_opset", cpu_plugin_opset() }, - { "type_relaxed_opset", type_relaxed_opset() }, - { "ie_internal_opset", ie_internal_opset() }, - { "SnippetsOpset", snippets_opset() }, - }; - - return opsets; -} - -std::vector 
Extension::getImplTypes(const std::shared_ptr&) { - return {}; -} - -InferenceEngine::ILayerImpl::Ptr Extension::getImplementation(const std::shared_ptr& node, const std::string& implType) { - return nullptr; -} - -} // namespace intel_cpu -} // namespace ov -// Generate exported function -IE_DEFINE_EXTENSION_CREATE_FUNCTION(ov::intel_cpu::Extension) +#define SNIPPETS_EXTENSIONS \ + OP_EXTENSION(ov::snippets::op::Brgemm) \ + OP_EXTENSION(ov::snippets::op::BroadcastLoad) \ + OP_EXTENSION(ov::snippets::op::BroadcastMove) \ + OP_EXTENSION(ov::snippets::op::ConvertSaturation) \ + OP_EXTENSION(ov::snippets::op::ConvertTruncation) \ + OP_EXTENSION(ov::snippets::op::Fill) \ + OP_EXTENSION(ov::snippets::op::HorizonMax) \ + OP_EXTENSION(ov::snippets::op::HorizonSum) \ + OP_EXTENSION(ov::snippets::op::Kernel) \ + OP_EXTENSION(ov::snippets::op::IntermediateMemoryBuffer) \ + OP_EXTENSION(ov::snippets::op::Load) \ + OP_EXTENSION(ov::snippets::op::LoadReshape) \ + OP_EXTENSION(ov::snippets::op::LoopBegin) \ + OP_EXTENSION(ov::snippets::op::LoopEnd) \ + OP_EXTENSION(ov::snippets::op::NewMemoryBuffer) \ + OP_EXTENSION(ov::snippets::op::Nop) \ + OP_EXTENSION(ov::snippets::op::PowerStatic) \ + OP_EXTENSION(ov::snippets::op::Scalar) \ + OP_EXTENSION(ov::snippets::op::Store) \ + OP_EXTENSION(ov::snippets::op::Subgraph) \ + OP_EXTENSION(ov::snippets::op::VectorBuffer) \ + OP_EXTENSION(ov::snippets::op::RankNormalization) + +OPENVINO_CREATE_EXTENSIONS(std::vector( + {CPU_EXTENSIONS TYPE_RELAXED_EXTENSIONS SNIPPETS_EXTENSIONS SNIPPETS_DEBUG_CAPS_EXTENSIONS})); diff --git a/src/plugins/intel_cpu/src/extension.h b/src/plugins/intel_cpu/src/extension.h deleted file mode 100644 index 6d36a20b38a598..00000000000000 --- a/src/plugins/intel_cpu/src/extension.h +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include - -namespace ov { -namespace intel_cpu { - -class Extension : public InferenceEngine::IExtension { -public: - void GetVersion(const InferenceEngine::Version*& versionInfo) const noexcept override; - void Unload() noexcept override; - std::map getOpSets() override; - std::vector getImplTypes(const std::shared_ptr& node) override; - InferenceEngine::ILayerImpl::Ptr getImplementation(const std::shared_ptr& node, const std::string& implType) override; -}; - -} // namespace intel_cpu -} // namespace ov diff --git a/src/plugins/intel_cpu/src/extension_mngr.cpp b/src/plugins/intel_cpu/src/extension_mngr.cpp deleted file mode 100644 index d842f227fb5590..00000000000000 --- a/src/plugins/intel_cpu/src/extension_mngr.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include -#include -#include - -#include "extension_mngr.h" - -using namespace InferenceEngine; - -namespace ov { -namespace intel_cpu { - -void ExtensionManager::AddExtension(const IExtensionPtr& extension) { - _extensions.push_back(extension); -} - -InferenceEngine::ILayerImpl::Ptr ExtensionManager::CreateImplementation(const std::shared_ptr& op) { - if (!op) - OPENVINO_THROW("Cannot get nGraph operation!"); - for (const auto& ext : _extensions) { - auto implTypes = ext->getImplTypes(op); - for (const auto& type : implTypes) { - if (type != "CPU") - continue; - auto impl = ext->getImplementation(op, "CPU"); - if (impl) - return impl; - } - } - return nullptr; -} - -const std::vector & ExtensionManager::Extensions() const { - return _extensions; -} - -} // namespace intel_cpu -} // 
namespace ov diff --git a/src/plugins/intel_cpu/src/extension_mngr.h b/src/plugins/intel_cpu/src/extension_mngr.h deleted file mode 100644 index 67505861bcdd75..00000000000000 --- a/src/plugins/intel_cpu/src/extension_mngr.h +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include -#include -#include -#include - -namespace ov { -namespace intel_cpu { - -class ExtensionManager { -public: - using Ptr = std::shared_ptr; - ExtensionManager() = default; - InferenceEngine::ILayerImpl::Ptr CreateImplementation(const std::shared_ptr& op); - void AddExtension(const InferenceEngine::IExtensionPtr& extension); - const std::vector & Extensions() const; - -private: - std::vector _extensions; -}; - -} // namespace intel_cpu -} // namespace ov diff --git a/src/plugins/intel_cpu/src/graph_context.h b/src/plugins/intel_cpu/src/graph_context.h index 2d90d039ba1489..3fc8f7e40fbf21 100644 --- a/src/plugins/intel_cpu/src/graph_context.h +++ b/src/plugins/intel_cpu/src/graph_context.h @@ -7,7 +7,6 @@ #include "cache/multi_cache.h" #include "config.h" #include "dnnl_scratch_pad.h" -#include "extension_mngr.h" #include "weights_cache.hpp" namespace ov { @@ -19,11 +18,9 @@ class GraphContext { typedef std::shared_ptr CPtr; GraphContext(const Config& config, - ExtensionManager::Ptr extensionManager, WeightsSharing::Ptr w_cache, bool isGraphQuantized) : config(config), - extensionManager(extensionManager), weightsCache(w_cache), isGraphQuantizedFlag(isGraphQuantized) { rtParamsCache = std::make_shared(config.rtCacheCapacity); @@ -34,10 +31,6 @@ class GraphContext { return config; } - ExtensionManager::Ptr getExtensionManager() const { - return extensionManager; - } - WeightsSharing::Ptr getWeightsCache() const { return weightsCache; } @@ -60,7 +53,6 @@ class GraphContext { private: Config config; // network-level config - ExtensionManager::Ptr extensionManager; WeightsSharing::Ptr weightsCache; // per NUMA node caches for sharing weights data MultiCachePtr rtParamsCache; // primitive cache diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index de686d0ff3185b..660d2b8d6bd7b2 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -10,7 +10,6 @@ #include "dnnl_extension_utils.h" #include "dnnl_types.h" #include "edge.h" -#include "extension_mngr.h" #include "itt.h" #include "memory_desc/cpu_memory_desc_utils.h" #include "memory_desc/dnnl_blocked_memory_desc.h" @@ -1295,7 +1294,7 @@ Node* Node::NodesFactory::create(const std::shared_ptr& op, const Grap if (newNode == nullptr) { try { std::unique_ptr ol(createNodeIfRegistered(intel_cpu, TypeFromName(op->get_type_name()), op, context)); - if (ol != nullptr && ol->created(context->getExtensionManager())) + if (ol != nullptr && ol->created()) newNode = ol.release(); } catch (const ov::Exception& ex) { if (dynamic_cast(&ex) != nullptr) { @@ -1309,7 +1308,7 @@ Node* Node::NodesFactory::create(const std::shared_ptr& op, const Grap if (newNode == nullptr) { try { std::unique_ptr ol(new Reference(op, context, errorMessage)); - if (ol != nullptr && ol->created(context->getExtensionManager())) + if (ol != nullptr && ol->created()) newNode = ol.release(); } catch (const ov::Exception& ex) { if (dynamic_cast(&ex) != nullptr) { diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h index 7601d09c4cd0d1..da529fbefacde7 100644 --- a/src/plugins/intel_cpu/src/node.h +++ 
b/src/plugins/intel_cpu/src/node.h @@ -12,8 +12,6 @@ #include "dnnl_postops_composer.h" #include "dnnl_scratch_pad.h" #include "edge.h" -#include "extension_mngr.h" -#include "graph_context.h" #include "nodes/common/blocked_desc_creator.h" #include "nodes/executors/executor.hpp" #include "nodes/executors/mvn_list.hpp" @@ -410,9 +408,6 @@ class Node { const std::vector& outputDesc) {} virtual void initDescriptor(const NodeConfig& config); virtual bool created() const = 0; - virtual bool created(const ExtensionManager::Ptr& extMgr) { - return created(); - } /** * @brief Performs Node initialization based on graph context. diff --git a/src/plugins/intel_cpu/src/nodes/if.h b/src/plugins/intel_cpu/src/nodes/if.h index 76a87874209984..ff41bd2a8c6dbb 100644 --- a/src/plugins/intel_cpu/src/nodes/if.h +++ b/src/plugins/intel_cpu/src/nodes/if.h @@ -59,7 +59,6 @@ class If : public Node { ptrdiff_t size; }; - ExtensionManager::Ptr ext_mng; Graph subGraphThen; Graph subGraphElse; std::vector> inputMemThen, inputMemElse; diff --git a/src/plugins/intel_cpu/src/nodes/tensoriterator.h b/src/plugins/intel_cpu/src/nodes/tensoriterator.h index 104ee077f9a163..07a1c0106b799b 100644 --- a/src/plugins/intel_cpu/src/nodes/tensoriterator.h +++ b/src/plugins/intel_cpu/src/nodes/tensoriterator.h @@ -140,7 +140,6 @@ class TensorIterator : public Node { int getNumIteration(const std::vector& inputPortMap, const std::vector& outputPortMap) const; bool runAsDynamic() const; - ExtensionManager::Ptr ext_mng; Graph sub_graph; std::vector> input_mems; std::vector output_mem; diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index e9a64a821431a6..0313e5ed3c4a8b 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -4,8 +4,6 @@ #include "plugin.h" -#include "extension.h" -#include "extension_mngr.h" #include "itt.h" #include "internal_properties.hpp" #include "openvino/runtime/intel_cpu/properties.hpp" @@ -171,7 +169,6 @@ Engine::Engine() : get_executor_manager()->execute_task_by_streams_executor(IStreamsExecutor::Config::PreferredCoreType::BIG, [] { dnnl::impl::cpu::x64::cpu(); }); - extensionManager->AddExtension(std::make_shared()); #if defined(OV_CPU_WITH_ACL) scheduler_guard = SchedulerGuard::instance(); #endif @@ -614,7 +611,7 @@ Engine::compile_model(const std::shared_ptr& model, const ov::A denormals_as_zero(false); } } - return std::make_shared(cloned_model, shared_from_this(), conf, extensionManager); + return std::make_shared(cloned_model, shared_from_this(), conf); } void Engine::set_property(const ov::AnyMap &config) { @@ -868,12 +865,6 @@ ov::Any Engine::get_ro_property(const std::string& name, const ov::AnyMap& optio OPENVINO_THROW("Cannot get unsupported property: ", name); } -OPENVINO_SUPPRESS_DEPRECATED_START -void Engine::add_extension(const InferenceEngine::IExtensionPtr& extension) { - extensionManager->AddExtension(extension); -} -OPENVINO_SUPPRESS_DEPRECATED_END - ov::SupportedOpsMap Engine::query_model(const std::shared_ptr& model, const ov::AnyMap& config) const { WeightsSharing::Ptr fake_w_cache; @@ -892,7 +883,7 @@ ov::SupportedOpsMap Engine::query_model(const std::shared_ptr& const Config::SnippetsMode snippetsMode = getSnippetsMode(config, conf); auto context = - std::make_shared(conf, extensionManager, fake_w_cache, false); + std::make_shared(conf, fake_w_cache, false); auto supported = ov::get_supported_nodes( model, @@ -945,7 +936,7 @@ std::shared_ptr Engine::import_model(std::istream& networkMo // import config 
props from caching model calculate_streams(conf, model, true); - auto compiled_model = std::make_shared(model, shared_from_this(), conf, extensionManager, true); + auto compiled_model = std::make_shared(model, shared_from_this(), conf, true); return compiled_model; } } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/plugin.h b/src/plugins/intel_cpu/src/plugin.h index 256eafdbadbaab..756387aa48a13d 100644 --- a/src/plugins/intel_cpu/src/plugin.h +++ b/src/plugins/intel_cpu/src/plugin.h @@ -43,10 +43,6 @@ class Engine : public ov::IPlugin { OPENVINO_THROW_NOT_IMPLEMENTED("Not Implemented get_default_context is not supported by CPU plugin!"); }; - OPENVINO_SUPPRESS_DEPRECATED_START - void add_extension(const std::shared_ptr& extension) override; - OPENVINO_SUPPRESS_DEPRECATED_END - private: bool is_legacy_api() const; @@ -62,7 +58,6 @@ class Engine : public ov::IPlugin { void calculate_streams(Config& conf, const std::shared_ptr& model, bool imported = false) const; Config engConfig; - ExtensionManager::Ptr extensionManager = std::make_shared(); /* Explicily configured streams have higher priority than performance hints. So track if streams is set explicitly (not auto-configured) */ bool streamsExplicitlySetForEngine = false; diff --git a/src/plugins/intel_cpu/src/serialize.cpp b/src/plugins/intel_cpu/src/serialize.cpp index ce7304de14af9a..777d7ea8a04ecc 100644 --- a/src/plugins/intel_cpu/src/serialize.cpp +++ b/src/plugins/intel_cpu/src/serialize.cpp @@ -24,27 +24,10 @@ static void setInfo(pugi::xml_node& root, std::shared_ptr& model) { } } -ModelSerializer::ModelSerializer(std::ostream & ostream, ExtensionManager::Ptr extensionManager) - : _ostream(ostream) - , _extensionManager(extensionManager) { -} +ModelSerializer::ModelSerializer(std::ostream& ostream) : _ostream(ostream) {} void ModelSerializer::operator<<(const std::shared_ptr& model) { OPENVINO_SUPPRESS_DEPRECATED_START - auto getCustomOpSets = [this]() { - std::map custom_opsets; - - if (_extensionManager) { - auto extensions = _extensionManager->Extensions(); - for (const auto& extension : extensions) { - auto opset = extension->getOpSets(); - custom_opsets.insert(std::begin(opset), std::end(opset)); - } - } - - return custom_opsets; - }; - auto serializeInfo = [&](std::ostream& stream) { const std::string name = "cnndata"; pugi::xml_document xml_doc; @@ -59,7 +42,7 @@ void ModelSerializer::operator<<(const std::shared_ptr& model) { }; // Serialize to old representation in case of old API - ov::pass::StreamSerialize serializer(_ostream, getCustomOpSets(), serializeInfo); + ov::pass::StreamSerialize serializer(_ostream, serializeInfo); OPENVINO_SUPPRESS_DEPRECATED_END serializer.run_on_model(std::const_pointer_cast(model->clone())); } diff --git a/src/plugins/intel_cpu/src/serialize.h b/src/plugins/intel_cpu/src/serialize.h index 5bbb22661003c7..b0c57a7ea9d91a 100644 --- a/src/plugins/intel_cpu/src/serialize.h +++ b/src/plugins/intel_cpu/src/serialize.h @@ -6,19 +6,17 @@ #include #include "cpp/ie_cnn_network.h" -#include "extension_mngr.h" namespace ov { namespace intel_cpu { class ModelSerializer { public: - ModelSerializer(std::ostream& ostream, ExtensionManager::Ptr extensionManager); + ModelSerializer(std::ostream& ostream); void operator<<(const std::shared_ptr& model); private: std::ostream& _ostream; - ExtensionManager::Ptr _extensionManager; }; class ModelDeserializer { diff --git a/src/plugins/intel_cpu/src/utils/ngraph_transformation.hpp b/src/plugins/intel_cpu/src/utils/ngraph_transformation.hpp index 
47466dbab52052..f7e9ecf30dbd4d 100644 --- a/src/plugins/intel_cpu/src/utils/ngraph_transformation.hpp +++ b/src/plugins/intel_cpu/src/utils/ngraph_transformation.hpp @@ -4,7 +4,6 @@ #pragma once #ifdef CPU_DEBUG_CAPS -#include "extension.h" #include "debug_caps_config.h" #include "openvino/util/file_util.hpp" #include @@ -68,8 +67,7 @@ class TransformationDumper { ov::pass::Manager serializer; if (config.dumpIR.format.filter[DebugCapsConfig::IrFormatFilter::XmlBin]) { - auto custom_opsets = std::make_shared()->getOpSets(); - serializer.register_pass(pathAndName + ".xml", "", custom_opsets); + serializer.register_pass(pathAndName + ".xml", ""); } if (config.dumpIR.format.filter[DebugCapsConfig::IrFormatFilter::Xml]) { diff --git a/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp b/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp index 3b2c070ac3579e..e98d5ba1f85bb3 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp @@ -79,7 +79,7 @@ TEST(MemStateGraphTest, smoke_Check_Memory_Modification_Guard) { Config conf; conf.rtCacheCapacity = 0; - auto context = std::make_shared(conf, nullptr, nullptr, false); + auto context = std::make_shared(conf, nullptr, false); auto input_node = std::make_shared(param, context); auto memory_input = std::make_shared(read, context); @@ -266,7 +266,7 @@ TEST(MemStateGraphTest, smoke_ShapeOf_no_Inplace_Conflicts) { Config conf; conf.rtCacheCapacity = 0; - auto context = std::make_shared(conf, nullptr, nullptr, false); + auto context = std::make_shared(conf, nullptr, false); auto input_node = std::make_shared(param, context); auto memory_input = std::make_shared(read, context); diff --git a/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp b/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp index b765c8e40e283e..fe43a4301a0268 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp @@ -60,7 +60,7 @@ class MergeTransposeReorderIsOptimizedCPUTest : public ::testing::Test { // Config conf; conf.rtCacheCapacity = 100; - auto context = std::make_shared(conf, nullptr, nullptr, false); + auto context = std::make_shared(conf, nullptr, false); const dnnl::engine cpuEngine = context->getEngine(); m_graph = std::unique_ptr(new Graph()); diff --git a/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp b/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp index 2c048e5e13b0e0..1ca1558a0a3d28 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp @@ -36,7 +36,7 @@ TEST(ResolveEdgeConflictsCPUTest, smoke_Run_ResolveEdgeConflicts) { */ Config conf; conf.rtCacheCapacity = 100; - auto context = std::make_shared(conf, nullptr, nullptr, false); + auto context = std::make_shared(conf, nullptr, false); const dnnl::engine cpuEngine = context->getEngine(); std::unique_ptr graph = std::unique_ptr(new Graph()); diff --git a/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp b/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp index 5dd1e01f5610a9..ff528d8e65dc74 100644 --- a/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp @@ -108,7 +108,6 @@ class ReorderCPUTestGraph { Config conf; conf.rtCacheCapacity = 100; auto 
context = std::make_shared(conf, - nullptr, std::make_shared(), false); const dnnl::engine cpuEngine = context->getEngine(); From 53d7c501bd49209a8fc8a5f1cbfe1619c81312f3 Mon Sep 17 00:00:00 2001 From: Karol Blaszczak Date: Mon, 15 Jan 2024 08:20:07 +0100 Subject: [PATCH 10/13] [DOCS] torchvision and optimization section (#22043) --- .../openvino_workflow/model_preparation.rst | 1 - .../model_preparation/pytorch_vision.rst | 12 ---- .../dldt_deployment_optimization_guide.rst | 5 +- .../dldt_deployment_optimization_tput.rst | 11 ++- ..._deployment_optimization_tput_advanced.rst | 4 +- .../precision_control.rst | 5 +- .../preprocessing_overview.rst | 1 + .../torchvision_preprocessing_converter.rst | 71 +++++++++++++++++++ docs/dev/get_started.md | 2 +- 9 files changed, 89 insertions(+), 23 deletions(-) delete mode 100644 docs/articles_en/openvino_workflow/model_preparation/pytorch_vision.rst create mode 100644 docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/preprocessing_overview/torchvision_preprocessing_converter.rst diff --git a/docs/articles_en/openvino_workflow/model_preparation.rst b/docs/articles_en/openvino_workflow/model_preparation.rst index f9bc468066669c..a70d79c5233ad6 100644 --- a/docs/articles_en/openvino_workflow/model_preparation.rst +++ b/docs/articles_en/openvino_workflow/model_preparation.rst @@ -15,7 +15,6 @@ Model Preparation Convert to OpenVINO Model Conversion Parameters Setting Input Shapes - PyVision preprocessing You can obtain a model in one of supported formats, **PyTorch, TensorFlow, TensorFlow Lite, ONNX, and PaddlePaddle**, diff --git a/docs/articles_en/openvino_workflow/model_preparation/pytorch_vision.rst b/docs/articles_en/openvino_workflow/model_preparation/pytorch_vision.rst deleted file mode 100644 index 34df9e465e022e..00000000000000 --- a/docs/articles_en/openvino_workflow/model_preparation/pytorch_vision.rst +++ /dev/null @@ -1,12 +0,0 @@ -.. {#pytorch_vision} - -PyVision -======================= - - -.. meta:: - :description: Learn about supported model formats and the methods used to convert, read, and compile them in OpenVINO™. - -Images input to AI models often need to be preprocessed in order to have proper dimensions or data type. -Instead of doing it with another library in an additional pipeline step, you can use torchvision.transforms OpenVINO feature. -It automatically translates a torchvision preprocessing pipeline to OpenVINO operators and then embeds them into your OpenVINO model, reducing overall program complexity and allowing additional performance optimizations to take place. 
\ No newline at end of file
diff --git a/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide.rst b/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide.rst
index 2628e8be39cb24..5c389e3d789aa5 100644
--- a/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide.rst
+++ b/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide.rst
@@ -13,10 +13,11 @@ Optimize Inference
    openvino_docs_OV_UG_Precision_Control
    openvino_docs_deployment_optimization_guide_latency
    openvino_docs_deployment_optimization_guide_tput
-   openvino_docs_deployment_optimization_guide_tput_advanced
+   Advanced Throughput Options <openvino_docs_deployment_optimization_guide_tput_advanced>
    openvino_docs_OV_UG_Preprocessing_Overview
    openvino_docs_deployment_optimization_guide_internals
-   openvino_docs_memory_optimization_guide
+   Optimizing memory usage <openvino_docs_memory_optimization_guide>
+

 .. meta::
    :description: Improving inference performance involves model and runtime
diff --git a/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/dldt_deployment_optimization_tput.rst b/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/dldt_deployment_optimization_tput.rst
index 3f1595189acb8c..b26d0c9e6132ad 100644
--- a/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/dldt_deployment_optimization_tput.rst
+++ b/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/dldt_deployment_optimization_tput.rst
@@ -10,8 +10,15 @@ Optimizing for Throughput
                  simultaneously which improves the device utilization.


-As described in the section on the :doc:`latency-specific considerations <openvino_docs_deployment_optimization_guide_latency>`, one of the possible use cases is *delivering every single request at the minimal delay*.
-Throughput, on the other hand, is about inference scenarios in which potentially **large number of inference requests are served simultaneously to improve the device utilization**.
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+
+   Advanced Throughput Options <openvino_docs_deployment_optimization_guide_tput_advanced>
+
+
+As described in the section on the :doc:`latency-specific optimizations <openvino_docs_deployment_optimization_guide_latency>`, one of the possible use cases is delivering every single request with minimal delay.
+Throughput, on the other hand, is about inference scenarios in which potentially **large numbers of inference requests are served simultaneously to improve resource use**.

 The associated increase in latency is not linearly dependent on the number of requests executed in parallel.
 A trade-off between overall throughput and serial performance of individual requests can be achieved with the right performance configuration of OpenVINO.
diff --git a/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/dldt_deployment_optimization_tput_advanced.rst b/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/dldt_deployment_optimization_tput_advanced.rst
index eff5a21ae1c202..b792cfca42be33 100644
--- a/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/dldt_deployment_optimization_tput_advanced.rst
+++ b/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/dldt_deployment_optimization_tput_advanced.rst
@@ -1,6 +1,6 @@
 .. {#openvino_docs_deployment_optimization_guide_tput_advanced}

-Using Advanced Throughput Options: Streams and Batching
+Advanced Throughput Options: Streams and Batching
 =======================================================


@@ -8,7 +8,7 @@ Using Advanced Throughput Options: Streams and Batching
    :description: With OpenVINO streams a device may handle processing
                  multiple inference requests and the batching helps to saturate
                  the device and leads to higher throughput.
-
+

 OpenVINO Streams
 ####################
diff --git a/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/precision_control.rst b/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/precision_control.rst
index 91ae9b011f5746..63de6309809943 100644
--- a/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/precision_control.rst
+++ b/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/precision_control.rst
@@ -9,9 +9,8 @@ The choice of data types is essential to the inference runtime, which can have a
 1. Model storage precision (IR precision),
 2. Model inference precision.

-Previously, these 2 precisions were interrelated, and model storage precision could affect the inference precision in some devices (e.g. GPU did ``f16`` inference only for ``f16`` IRs).
-
-With the ``2023.0`` release this behavior has been changed and the inference precision no longer depends on the precision of IR. Now users have several knobs to find the balance between model performance and accuracy.
+Inference precision no longer depends on the precision of IR, which means that users
+have several options to find the balance between model performance and accuracy.

 Essentially, the IR precision becomes a way of compressing the model by reducing the precision of the weights, and it does not affect how the devices execute the model.
 This change clears up a lot of confusion where, for example, you couldn't execute a high-performance model on the GPU by default, and the behavior between devices was different.
diff --git a/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/preprocessing_overview.rst b/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/preprocessing_overview.rst
index bac4584fb4cf99..48020d3928de0c 100644
--- a/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/preprocessing_overview.rst
+++ b/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/preprocessing_overview.rst
@@ -11,6 +11,7 @@ Optimize Preprocessing
    openvino_docs_OV_UG_Preprocessing_Details
    openvino_docs_OV_UG_Layout_Overview
    openvino_docs_OV_UG_Preprocess_Usecase_save
+   Torchvision preprocessing converter <torchvision_preprocessing_converter>

 .. meta::
    :description: The preprocessing entails additional operations to transform
diff --git a/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/preprocessing_overview/torchvision_preprocessing_converter.rst b/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/preprocessing_overview/torchvision_preprocessing_converter.rst
new file mode 100644
index 00000000000000..264edda073b2d6
--- /dev/null
+++ b/docs/articles_en/openvino_workflow/running_inference_with_openvino/dldt_deployment_optimization_guide/preprocessing_overview/torchvision_preprocessing_converter.rst
@@ -0,0 +1,71 @@
+.. {#torchvision_preprocessing_converter}
+
+Torchvision preprocessing converter
+=======================================
+
+
+.. meta::
+   :description: See how OpenVINO™ enables torchvision preprocessing
+                 to optimize model inference.
+
+
+The Torchvision-to-OpenVINO converter enables automatic translation of operators from the torchvision
+preprocessing pipeline to the OpenVINO format and embeds them in your model. It is often used to adjust
+images serving as input for AI models so that they have the proper dimensions and data types.
+
+As the converter is fully based on the **openvino.preprocess** module, you can implement the **torchvision.transforms**
+feature easily and without the use of external libraries, reducing the overall application complexity
+and enabling additional performance optimizations.
+
+
+.. note::
+
+   Not all torchvision transforms are supported yet. The following operations are available:
+
+   .. code-block::
+
+      transforms.Compose
+      transforms.Normalize
+      transforms.ConvertImageDtype
+      transforms.Grayscale
+      transforms.Pad
+      transforms.ToTensor
+      transforms.CenterCrop
+      transforms.Resize
+
+
+Example
+###################
+
+.. code-block:: py
+
+   preprocess_pipeline = torchvision.transforms.Compose(
+       [
+           torchvision.transforms.Resize(256, interpolation=torchvision.transforms.InterpolationMode.NEAREST),
+           torchvision.transforms.CenterCrop((216, 218)),
+           torchvision.transforms.Pad((2, 3, 4, 5), fill=3),
+           torchvision.transforms.ToTensor(),
+           torchvision.transforms.ConvertImageDtype(torch.float32),
+           torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+       ]
+   )
+
+   torch_model = SimpleConvnet(input_channels=3)
+
+   torch.onnx.export(torch_model, torch.randn(1, 3, 224, 224), "test_convnet.onnx", verbose=False, input_names=["input"], output_names=["output"])
+   core = Core()
+   ov_model = core.read_model(model="test_convnet.onnx")
+
+   test_input = np.random.randint(255, size=(260, 260, 3), dtype=np.uint16)
+   ov_model = PreprocessConverter.from_torchvision(
+       model=ov_model, transform=preprocess_pipeline, input_example=Image.fromarray(test_input.astype("uint8"), "RGB")
+   )
+   ov_model = core.compile_model(ov_model, "CPU")
+   ov_input = np.expand_dims(test_input, axis=0)
+   output = ov_model.output(0)
+   ov_result = ov_model(ov_input)[output]
+
+
+
+
diff --git a/docs/dev/get_started.md b/docs/dev/get_started.md
index 93b6000dbee1b1..b5d09af43dd7cb 100644
--- a/docs/dev/get_started.md
+++ b/docs/dev/get_started.md
@@ -18,4 +18,4 @@ Explore other resources to learn more about OpenVINO:
  * [OpenVINO Developer Documentation](./index.md)
  * [OpenVINO Samples](../../samples)
  * [OpenVINO Building Documentation](./building_documentation.md)
- * [CMake Options for Custom Compilation](./cmake_options_for_custom_comiplation.md)
+ * [CMake Options for Custom Compilation](./cmake_options_for_custom_compilation.md)

From 938600fbf7aba0c836e12787b0ce037dae7ecc8c Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Mon, 15 Jan 2024 09:57:24 +0100
Subject: [PATCH 11/13] [tests] resolve HF models 4th batch (#21907)

* add inspect pytorch modules
* add infer for each submodule
* skip -> xfail in several models
* successful separate modules execution
* revert jukebox, whisper
* ready for review
* rely on tags instead of names
* fix import error
---
 .../torch_tests/hf_transformers_models        | 11 +++--
 .../torch_tests/test_hf_transformers.py       | 44 ++++++++++++++++++-
 2 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/tests/model_hub_tests/torch_tests/hf_transformers_models b/tests/model_hub_tests/torch_tests/hf_transformers_models
index cec6509471c141..2cec1a1b744901 100644
--- a/tests/model_hub_tests/torch_tests/hf_transformers_models
+++ b/tests/model_hub_tests/torch_tests/hf_transformers_models
@@ -209,7 +209,8 @@ krasserm/perceiver-io-mlm,perceiver-io-masked-language-model,skip,Load problem
 krasserm/perceiver-io-optical-flow,perceiver-io-optical-flow,skip,Load problem
 krasserm/perceiver-io-txt-clf-imdb,perceiver-io-text-classifier,skip,Load problem
 ksmcg/fan_small_12_p16_224,fan,skip,Load problem
-laion/clap-htsat-unfused,clap,skip,Load problem
+laion/clap-htsat-unfused:audio_model,clap
+laion/clap-htsat-unfused:audio_projection,clap
 Langboat/ReGPT-125M-200G,re_gpt,skip,Load problem
 lengyue233/content-vec-best,hubert
 Lewislou/cellseg_sribd,cell_sribd,skip,Load problem
@@ -241,7 +242,7 @@ microsoft/beit-base-patch16-224-pt22k-ft22k,beit
 microsoft/biogpt,biogpt
 microsoft/conditional-detr-resnet-50,conditional_detr
 microsoft/deberta-base,deberta
-microsoft/git-large-coco,git,skip,Load problem
+microsoft/git-large-coco,git,xfail,Tracing error: Please check correctness of provided example_input (but eval was correct)
 microsoft/layoutlm-base-uncased,layoutlm
 microsoft/layoutlmv2-base-uncased,layoutlmv2,xfail,Tracing error: Please check correctness of provided example_input (but eval was correct)
 microsoft/layoutlmv3-base,layoutlmv3
@@ -316,7 +317,9 @@ RWKV/rwkv-4-169m-pile,rwkv
 sahasrarjn/interbert,BERT,skip,Load problem
 saibo/genkalm-medium-gpt2,genkalm,skip,Load problem
 SajjadAyoubi/clip-fa-vision,clip_vision_model
-Salesforce/blip2-flan-t5-xl,blip-2,skip,Load problem
+Salesforce/blip2-flan-t5-xl:vision_model,blip-2
+Salesforce/blip2-flan-t5-xl:qformer,blip-2
+Salesforce/blip2-flan-t5-xl:language_projection,blip-2
 Salesforce/blip-image-captioning-large,blip
 Salesforce/instructblip-vicuna-7b,instructblip,skip,Load problem
 SamLowe/roberta-base-go_emotions,roberta
@@ -410,5 +413,5 @@ Yova/SmallCapOPT7M,smallcap,skip,Load problem
 yusufani/trclip-vitl14-e10,trclip,skip,Load problem
 yysung53/dpr,text_similarity,skip,Load problem
 Zetatech/pvt-tiny-224,pvt,skip,Load problem
-ZinengTang/tvlt-base,tvlt,skip,Load problem
+ZinengTang/tvlt-base,tvlt,xfail,Conversion is failed for aten::cat: Argument element types are inconsistent
 zuppif/resnetd-18,resnetd,skip,Load problem
diff --git a/tests/model_hub_tests/torch_tests/test_hf_transformers.py b/tests/model_hub_tests/torch_tests/test_hf_transformers.py
index a47730d5ba1aab..1e8c3d76983120 100644
--- a/tests/model_hub_tests/torch_tests/test_hf_transformers.py
+++ b/tests/model_hub_tests/torch_tests/test_hf_transformers.py
@@ -99,7 +99,13 @@ def setup_class(self):
         self.cuda_available, self.gptq_postinit = None, None

     def load_model(self, name, type):
+        import torch
+        name_suffix = ''
         from transformers import AutoConfig
+        if name.find(':') != -1:
+            name_suffix = name[name.find(':') + 1:]
+            name = name[:name.find(':')]
+
         mi = model_info(name)
         auto_processor = None
         model = None
@@ -163,6 +169,41 @@ def load_model(self, name, type):
             processor = AutoProcessor.from_pretrained(name)
             model = AutoModel.from_pretrained(name, **model_kwargs)
             example = dict(processor(images=self.image, task_inputs=["semantic"], return_tensors="pt"))
+        elif 'clap' in mi.tags:
+            from transformers import AutoModel
+            model = AutoModel.from_pretrained(name)
+
+            import torch
+            example_inputs_map = {
+                'audio_model': {'input_features': torch.randn([1, 1, 1001, 64], dtype=torch.float32)},
+                'audio_projection': {'hidden_states': torch.randn([1, 768], dtype=torch.float32)},
+            }
+            model = model._modules[name_suffix]
+            example = example_inputs_map[name_suffix]
+        elif 'git' in mi.tags:
+            from transformers import AutoProcessor, AutoModelForCausalLM
+            processor = AutoProcessor.from_pretrained(name)
+            model = AutoModelForCausalLM.from_pretrained(name)
+            import torch
+            example = {'pixel_values': torch.randn(*(1, 3, 224, 224), dtype=torch.float32),
+                       'input_ids': torch.randint(1, 100, size=(1, 13), dtype=torch.int64)}
+        elif 'blip-2' in mi.tags:
+            from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering
+
+            processor = AutoProcessor.from_pretrained(name)
+            model = AutoModelForVisualQuestionAnswering.from_pretrained(name)
+
+            example = dict(processor(images=self.image, return_tensors="pt"))
+            import torch
+            example_inputs_map = {
+                'vision_model' : {'pixel_values': torch.randn([1, 3, 224, 224], dtype=torch.float32)},
+                'qformer': {'query_embeds' : torch.randn([1, 32, 768], dtype=torch.float32),
+                            'encoder_hidden_states' : torch.randn([1, 257, 1408], dtype=torch.float32),
+                            'encoder_attention_mask' : torch.ones([1, 257], dtype=torch.int64)},
+                'language_projection': {'input' : torch.randn([1, 32,
+                                                               768], dtype=torch.float32)},
+            }
+            model = model._modules[name_suffix]
+            example = example_inputs_map[name_suffix]
         elif "t5" in mi.tags:
             from transformers import T5Tokenizer
             tokenizer = T5Tokenizer.from_pretrained(name)
@@ -257,6 +298,7 @@ def forward(self, pixel_values, input_ids, attention_mask):
         elif 'speecht5' in mi.tags:
             from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
             from datasets import load_dataset
+
             processor = SpeechT5Processor.from_pretrained(name)
             model = SpeechT5ForTextToSpeech.from_pretrained(name)
@@ -264,7 +306,7 @@ def forward(self, pixel_values, input_ids, attention_mask):
             # load xvector containing speaker's voice characteristics from a dataset
             embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
             speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-
+
             example = {'input_ids': inputs["input_ids"], 'speaker_embeddings': speaker_embeddings}

             class DecoratorModelForSeq2SeqLM(torch.nn.Module):
                 def __init__(self, model):

From b5b53e1749296ee3dbd5774400df302bfe37786a Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Mon, 15 Jan 2024 12:05:24 +0100
Subject: [PATCH 12/13] Keep compressed constants produced by FW (#22095)

* keep FW 16 bit float constants
* add layer tests
* remove leftovers from onnx, pdpd
* rename to MarkCompressedFloatConstants
* remove VisualizeTree
* added explanation why we need MarkCompressedFloatConstants
---
 ...decompression_convert_constant_folding.hpp | 14 ++++
 ...decompression_convert_constant_folding.cpp | 27 +++++++
 .../compress_float_constants_test.cpp         | 81 +++++++++++++++++++
 src/frontends/pytorch/src/frontend.cpp        |  5 ++
 src/frontends/tensorflow/src/frontend.cpp     |  5 ++
 .../tensorflow_lite/src/frontend.cpp          |  4 +
 .../ovc_python_api_tests/test_pytorch.py      | 22 +++++
 .../ovc_python_api_tests/test_tf.py           | 23 +++++
 8 files changed, 181 insertions(+)

diff --git a/src/common/transformations/include/transformations/fp16_compression/mark_decompression_convert_constant_folding.hpp b/src/common/transformations/include/transformations/fp16_compression/mark_decompression_convert_constant_folding.hpp
index 7576abfac15e9e..037efc27c2f36e 100644
--- a/src/common/transformations/include/transformations/fp16_compression/mark_decompression_convert_constant_folding.hpp
+++ b/src/common/transformations/include/transformations/fp16_compression/mark_decompression_convert_constant_folding.hpp
@@ -15,6 +15,7 @@ class TRANSFORMATIONS_API EnableDecompressionConvertConstantFolding;
 class TRANSFORMATIONS_API DisableDecompressionConvertConstantFolding;
 class TRANSFORMATIONS_API KeepConstAndDecompression;
 class TRANSFORMATIONS_API KeepConstantsPrecisionAndAddConverts;
+class TRANSFORMATIONS_API MarkCompressedFloatConstants;

 }  // namespace pass
 }  // namespace ov
@@ -58,3 +59,16 @@ class ov::pass::KeepConstantsPrecisionAndAddConverts : public MatcherPass {
     OPENVINO_RTTI("KeepConstantsPrecisionAndAddConverts", "0");
     KeepConstantsPrecisionAndAddConverts();
 };
+
+/**
+ * @ingroup ie_transformation_common_api
+ * @brief Prevents ConstantFolding for f16/bf16 Const + Convert_To_FP32 to keep the original FW float Constants.
+ * The original precision should be kept as long as possible; this prevents redundant conversions and saves memory.
+ * E.g., if the original FW model was already compressed, there is no need to upcast during CF, store intermediate
+ * f32 consts, and then compress them back to f16 during save_model.
+ */
+class ov::pass::MarkCompressedFloatConstants : public MatcherPass {
+public:
+    OPENVINO_RTTI("MarkCompressedFloatConstants", "0");
+    MarkCompressedFloatConstants();
+};
diff --git a/src/common/transformations/src/transformations/fp16_compression/mark_decompression_convert_constant_folding.cpp b/src/common/transformations/src/transformations/fp16_compression/mark_decompression_convert_constant_folding.cpp
index 26505dee5278b8..894e8224d5ead6 100644
--- a/src/common/transformations/src/transformations/fp16_compression/mark_decompression_convert_constant_folding.cpp
+++ b/src/common/transformations/src/transformations/fp16_compression/mark_decompression_convert_constant_folding.cpp
@@ -119,3 +119,30 @@ pass::KeepConstantsPrecisionAndAddConverts::KeepConstantsPrecisionAndAddConverts
     auto m = std::make_shared<pattern::Matcher>(const_pattern, matcher_name);
     this->register_matcher(m, callback);
 }
+
+pass::MarkCompressedFloatConstants::MarkCompressedFloatConstants() {
+    MATCHER_SCOPE(MarkCompressedFloatConstants);
+
+    auto constant = pattern::wrap_type<ov::op::v0::Constant>();
+    auto convert = pattern::wrap_type<ov::op::v0::Convert>({constant});
+
+    matcher_pass_callback callback = [=](pattern::Matcher& m) {
+        const auto& convert_node = as_type_ptr<ov::op::v0::Convert>(m.get_match_root());
+        const auto& const_node = convert_node->input_value(0).get_node_shared_ptr();
+        if (convert_node == nullptr || const_node == nullptr)
+            return false;
+        if (convert_node->get_destination_type() != element::f32)
+            return false;
+        if (const_node->get_output_element_type(0) != element::f16 &&
+            const_node->get_output_element_type(0) != element::bf16)
+            return false;
+
+        mark_as_decompression(convert_node);
+        disable_constant_folding(const_node);
+        disable_constant_folding(convert_node);
+        return true;
+    };
+
+    auto m = std::make_shared<pattern::Matcher>(convert, matcher_name);
+    this->register_matcher(m, callback);
+}
diff --git a/src/common/transformations/tests/common_optimizations/compress_float_constants_test.cpp b/src/common/transformations/tests/common_optimizations/compress_float_constants_test.cpp
index 2d256b5df6e128..9ed38a8c845a8d 100644
--- a/src/common/transformations/tests/common_optimizations/compress_float_constants_test.cpp
+++ b/src/common/transformations/tests/common_optimizations/compress_float_constants_test.cpp
@@ -14,6 +14,7 @@
 #include "openvino/opsets/opset8.hpp"
 #include "openvino/pass/manager.hpp"
 #include "transformations/common_optimizations/mark_precision_sensitive_shapeof_subgraphs.hpp"
+#include "transformations/fp16_compression/mark_decompression_convert_constant_folding.hpp"
 #include "transformations/init_node_info.hpp"
 #include "transformations/utils/utils.hpp"

 using namespace ov;
@@ -515,3 +516,83 @@ TEST_F(TransformationTestsF, CompressConstants_compress_to_f16_denormal_vals) {
     }
     comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
 }
+
+TEST_F(TransformationTestsF, KeepFWPrecisionForFP16Constants_test_1) {
+    {
+        auto input = std::make_shared<ov::opset8::Parameter>(ov::element::f32, ov::Shape{1, 3, 12, 12});
+        auto const_weights = ov::op::v0::Constant::create(
+            ov::element::f16,
+            ov::Shape{1, 3, 3, 3},
+            {1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9});
+        auto convert_node = std::make_shared<ov::opset8::Convert>(const_weights, element::f32);
+
+        auto conv = std::make_shared<ov::opset8::Convolution>(input,
+                                                              convert_node,
+                                                              ov::Strides{1, 1},
+                                                              ov::CoordinateDiff{0, 0},
+                                                              ov::CoordinateDiff{0, 0},
+                                                              ov::Strides{1, 1});
+        model = std::make_shared<ov::Model>(ov::NodeVector{conv}, ov::ParameterVector{input});
+
+        manager.register_pass<ov::pass::MarkCompressedFloatConstants>();
+        manager.register_pass<ov::pass::ConstantFolding>();
+    }
+
+    {
+        auto input =
std::make_shared(ov::element::f32, ov::Shape{1, 3, 12, 12}); + auto const_weights = ov::opset8::Constant::create( + ov::element::f16, + ov::Shape{1, 3, 3, 3}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + + auto convert_node = std::make_shared(const_weights, element::f32); + auto conv = std::make_shared(input, + convert_node, + ov::Strides{1, 1}, + ov::CoordinateDiff{0, 0}, + ov::CoordinateDiff{0, 0}, + ov::Strides{1, 1}); + model_ref = std::make_shared(ov::NodeVector{conv}, ov::ParameterVector{input}); + } + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); +} + +TEST_F(TransformationTestsF, KeepFWPrecisionForBF16Constants_test_1) { + { + auto input = std::make_shared(ov::element::f32, ov::Shape{1, 3, 12, 12}); + auto const_weights = ov::op::v0::Constant::create( + ov::element::bf16, + ov::Shape{1, 3, 3, 3}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + auto convert_node = std::make_shared(const_weights, element::f32); + + auto conv = std::make_shared(input, + convert_node, + ov::Strides{1, 1}, + ov::CoordinateDiff{0, 0}, + ov::CoordinateDiff{0, 0}, + ov::Strides{1, 1}); + model = std::make_shared(ov::NodeVector{conv}, ov::ParameterVector{input}); + + manager.register_pass(); + manager.register_pass(); + } + + { + auto input = std::make_shared(ov::element::f32, ov::Shape{1, 3, 12, 12}); + auto const_weights = ov::opset8::Constant::create( + ov::element::bf16, + ov::Shape{1, 3, 3, 3}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + + auto convert_node = std::make_shared(const_weights, element::f32); + auto conv = std::make_shared(input, + convert_node, + ov::Strides{1, 1}, + ov::CoordinateDiff{0, 0}, + ov::CoordinateDiff{0, 0}, + ov::Strides{1, 1}); + model_ref = std::make_shared(ov::NodeVector{conv}, ov::ParameterVector{input}); + } + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); +} diff --git a/src/frontends/pytorch/src/frontend.cpp b/src/frontends/pytorch/src/frontend.cpp index b1e61fbcae9085..ee9a9919045f72 100644 --- a/src/frontends/pytorch/src/frontend.cpp +++ b/src/frontends/pytorch/src/frontend.cpp @@ -16,6 +16,7 @@ #include "transformations/common_optimizations/remove_multi_subgraph_op_dangling_params.hpp" #include "transformations/common_optimizations/reverse_shape_and_type_infer.hpp" #include "transformations/control_flow/unroll_if.hpp" +#include "transformations/fp16_compression/mark_decompression_convert_constant_folding.hpp" #include "transformations/low_precision/mark_dequantization_subgraph.hpp" #include "transformations/op_conversions/convert_convertlike.hpp" #include "transformations/resolve_names_collisions.hpp" @@ -176,9 +177,13 @@ void FrontEnd::normalize(const std::shared_ptr& model) const { manager.register_pass(); manager.register_pass(); + // Mark quantized and f16/bf16 compressed constants to prevent CF for them, + // so that not extra memory is used for intermediate decompressed constants. 
manager.register_pass( element::TypeVector{element::u8, element::i8, element::u4, element::i4}); + manager.register_pass(); manager.register_pass(); + manager.register_pass(); manager.register_pass(); manager.register_pass(); diff --git a/src/frontends/tensorflow/src/frontend.cpp b/src/frontends/tensorflow/src/frontend.cpp index c9eafa46890476..b017f7d954d65b 100644 --- a/src/frontends/tensorflow/src/frontend.cpp +++ b/src/frontends/tensorflow/src/frontend.cpp @@ -30,6 +30,7 @@ #include "transformations/common_optimizations/remove_concat_zero_dim_input.hpp" #include "transformations/common_optimizations/reverse_shape_and_type_infer.hpp" #include "transformations/control_flow/unroll_if.hpp" +#include "transformations/fp16_compression/mark_decompression_convert_constant_folding.hpp" #include "transformations/resolve_names_collisions.hpp" #include "transformations/switch_merge_resolve.hpp" #include "transformations/transpose_sinking/ts_general.hpp" @@ -507,6 +508,10 @@ void FrontEnd::convert(const std::shared_ptr& partiallyConverted) con void FrontEnd::normalize(const std::shared_ptr& model) const { ov::pass::Manager manager; + + // Mark quantized and f16/bf16 compressed constants to prevent CF for them, + // so that not extra memory is used for intermediate decompressed constants. + manager.register_pass(); manager.register_pass(); manager.register_pass(); manager.register_pass(); diff --git a/src/frontends/tensorflow_lite/src/frontend.cpp b/src/frontends/tensorflow_lite/src/frontend.cpp index bb6f82fa55281b..5f58977743650a 100644 --- a/src/frontends/tensorflow_lite/src/frontend.cpp +++ b/src/frontends/tensorflow_lite/src/frontend.cpp @@ -16,6 +16,7 @@ #include "tflite_transformations/rfft2d_complex_abs.h" #include "tflite_transformations/tflite_quantize_resolver.hpp" #include "transformations/common_optimizations/transpose_sinking.hpp" +#include "transformations/fp16_compression/mark_decompression_convert_constant_folding.hpp" #include "transformations/resolve_names_collisions.hpp" #include "transformations/transpose_sinking/ts_general.hpp" @@ -284,6 +285,9 @@ std::shared_ptr FrontEnd::decode(const InputModel::Ptr& model) const void FrontEnd::normalize(const std::shared_ptr& function) const { ov::pass::Manager manager; + // Mark quantized and f16/bf16 compressed constants to prevent CF for them, + // so that not extra memory is used for intermediate decompressed constants. 
+ manager.register_pass(); manager.register_pass(); manager.register_pass(); manager.register_pass(); diff --git a/tests/layer_tests/ovc_python_api_tests/test_pytorch.py b/tests/layer_tests/ovc_python_api_tests/test_pytorch.py index 77a088fea8f411..1b5e1eda44ce7d 100644 --- a/tests/layer_tests/ovc_python_api_tests/test_pytorch.py +++ b/tests/layer_tests/ovc_python_api_tests/test_pytorch.py @@ -467,6 +467,27 @@ def create_pytorch_nn_module_scale_list_compression_enabled(tmp_dir): 'compress_to_fp16': True, 'use_convert_model_from_mo': True} +def create_pytorch_nn_module_with_compressed_constants(tmp_dir): + import torch + + class NeuralNetwork(torch.nn.Module): + def __init__(self): + super(NeuralNetwork, self).__init__() + self.y = torch.arange(10, dtype=torch.float16) + + def forward(self, x): + return x + self.y.to(torch.float32) + + param_1 = ov.opset13.parameter([10], dtype=np.float32) + const_1 = ov.opset13.constant(np.arange(10), dtype=np.float16) + convert_1 = ov.opset13.convert(const_1, np.float32) + add_1 = ov.opset13.add(param_1, convert_1) + + ov_model_ref = Model([add_1], [param_1], "test") + fw_model = NeuralNetwork() + return fw_model, ov_model_ref, {'input': [([10], np.float32)]} + + def create_pytorch_nn_module_shapes_list_static(tmp_dir): pt_model = make_pt_model_two_inputs() ref_model = make_ref_pt_model_two_inputs([1, 3, 20, 20]) @@ -1020,6 +1041,7 @@ class TestMoConvertPyTorch(CommonMOConvertTest): create_pytorch_nn_module_scale_list_compression_default, create_pytorch_nn_module_scale_list_compression_disabled, create_pytorch_nn_module_scale_list_compression_enabled, + create_pytorch_nn_module_with_compressed_constants, create_pytorch_nn_module_shapes_list_static, create_pytorch_nn_module_shapes_list_static_via_input, create_pytorch_nn_module_shapes_list_dynamic, diff --git a/tests/layer_tests/ovc_python_api_tests/test_tf.py b/tests/layer_tests/ovc_python_api_tests/test_tf.py index 5f0b8fa5a37253..179de425284416 100644 --- a/tests/layer_tests/ovc_python_api_tests/test_tf.py +++ b/tests/layer_tests/ovc_python_api_tests/test_tf.py @@ -535,6 +535,28 @@ def __call__(self, input1, input2): return model, model_ref, {} +def create_keras_layer_with_compressed_constants(tmp_dir): + import tensorflow as tf + + class LayerModel(tf.Module): + def __init__(self): + super(LayerModel, self).__init__() + self.const = tf.constant([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], shape=[10], dtype=tf.float16) + + @tf.function(input_signature=[tf.TensorSpec([10], tf.float32)]) + def __call__(self, input_1): + return input_1 + tf.cast(self.const, dtype=tf.float32) + + param_1 = ov.opset13.parameter([10], dtype=np.float32) + const_1 = ov.opset13.constant(np.arange(10), dtype=np.float16) + convert_1 = ov.opset13.convert(const_1, np.float32) + add_1 = ov.opset13.add(param_1, convert_1) + + ov_model_ref = Model([add_1], [param_1], "test") + fw_model = LayerModel() + return fw_model, ov_model_ref, {} + + def create_keras_layer_with_tf_function_call_no_signature(tmp_dir): class LayerModel(tf.Module): def __init__(self): @@ -673,6 +695,7 @@ class TestMoConvertTF(CommonMOConvertTest): create_keras_layer_with_input_shapes_case4, create_keras_layer_with_tf_function_call, create_keras_layer_with_tf_function_call_default_compressed_to_fp16, + create_keras_layer_with_compressed_constants, create_keras_layer_with_tf_function_call_no_signature, create_keras_layer_with_tf_function_call_no_signature_single_input, create_keras_layer_with_string_tensor, From 3a97c299e6fb48561907fd099cde92bc19b71889 Mon Sep 17 00:00:00 2001 
From: Ilya Lavrenov Date: Mon, 15 Jan 2024 15:21:43 +0400 Subject: [PATCH 13/13] Fixed GHSA-h5c8-rqwp-cp95 (#22159) --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 77b88045445dd5..34a15149c88d29 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -13,7 +13,7 @@ imagesize==1.2.0 importlib-metadata==4.4.0 iniconfig==1.1.1 ipython==8.10.0 -Jinja2==3.1.2 +Jinja2==3.1.3 lxml>=4.9.2 MarkupSafe==2.1.1 mistune==2.0.3