diff --git a/.github/dockerfiles/docker_tag b/.github/dockerfiles/docker_tag index 8e5386a30ec997..56faa37d1da67f 100644 --- a/.github/dockerfiles/docker_tag +++ b/.github/dockerfiles/docker_tag @@ -1 +1 @@ -pr-26656 \ No newline at end of file +pr-26993 \ No newline at end of file diff --git a/.github/dockerfiles/ov_build/ubuntu_20_04_arm64/Dockerfile b/.github/dockerfiles/ov_build/ubuntu_20_04_arm64/Dockerfile index c7d0e95164f414..7653fe6abb7434 100644 --- a/.github/dockerfiles/ov_build/ubuntu_20_04_arm64/Dockerfile +++ b/.github/dockerfiles/ov_build/ubuntu_20_04_arm64/Dockerfile @@ -35,7 +35,7 @@ RUN apt-get update && \ libhdf5-dev \ # For Java API default-jdk \ - # Compiler + # Compiler, required for multi-isa build gcc-10 \ g++-10 \ && \ diff --git a/.github/dockerfiles/ov_build/ubuntu_20_04_x64/Dockerfile b/.github/dockerfiles/ov_build/ubuntu_20_04_x64/Dockerfile index 53829ad50b2975..1620e674ef67d5 100644 --- a/.github/dockerfiles/ov_build/ubuntu_20_04_x64/Dockerfile +++ b/.github/dockerfiles/ov_build/ubuntu_20_04_x64/Dockerfile @@ -30,9 +30,6 @@ RUN apt-get update && \ python3.9-distutils \ # For Java API default-jdk \ - # Compiler \ - gcc-10 \ - g++-10 \ && \ rm -rf /var/lib/apt/lists/* @@ -42,10 +39,6 @@ RUN chmod +x /install_build_dependencies.sh && \ /install_build_dependencies.sh && \ rm -rf /var/lib/apt/lists/* -# Set gcc-10 as a default compiler -RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 30 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 30 - # Install sscache ARG SCCACHE_VERSION="v0.7.5" ENV SCCACHE_HOME="/opt/sccache" \ diff --git a/.github/dockerfiles/ov_build/ubuntu_20_04_x64_nvidia/Dockerfile b/.github/dockerfiles/ov_build/ubuntu_20_04_x64_nvidia/Dockerfile index 5df369bbb6398a..0a4d7ef90aa115 100644 --- a/.github/dockerfiles/ov_build/ubuntu_20_04_x64_nvidia/Dockerfile +++ b/.github/dockerfiles/ov_build/ubuntu_20_04_x64_nvidia/Dockerfile @@ -35,9 +35,6 @@ RUN apt-get update && \ python3.11-distutils \ # For Java API default-jdk \ - # Compiler \ - gcc-10 \ - g++-10 \ && \ rm -rf /var/lib/apt/lists/* @@ -47,10 +44,6 @@ RUN chmod +x /install_build_dependencies.sh && \ /install_build_dependencies.sh && \ rm -rf /var/lib/apt/lists/* -# Set gcc-10 as a default compiler -RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 30 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 30 - # Install sscache ARG SCCACHE_VERSION="v0.7.5" ENV SCCACHE_HOME="/opt/sccache" \ diff --git a/.github/github_org_control/configs.py b/.github/github_org_control/configs.py index 872638bb657fdf..3df12803c77de0 100644 --- a/.github/github_org_control/configs.py +++ b/.github/github_org_control/configs.py @@ -14,8 +14,8 @@ from pathlib import Path -if sys.version_info[:2] < (3, 8): - raise Exception("Python version must be >= 3.8") +if sys.version_info[:2] < (3, 9): + raise Exception("Python version must be >= 3.9") class ConfigException(Exception): diff --git a/.github/labeler.yml b/.github/labeler.yml index 5421d669ed224f..daa5375b175bd3 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -76,6 +76,8 @@ 'category: NPUW': - 'src/plugins/intel_npu/src/plugin/npuw/**/*' +- 'src/plugins/intel_npu/tests/functional/behavior/npuw/**/*' +- 'src/plugins/intel_npu/tests/unit/behavior/npuw/**/*' 'category: HETERO': - 'src/plugins/hetero/**/*' diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index 604ca0fdb81b29..6a163fb5e50043 100644 --- a/.github/workflows/coverity.yml +++ 
b/.github/workflows/coverity.yml @@ -112,7 +112,7 @@ jobs: - name: Pack Artefacts run: | pushd ${BUILD_DIR} - tar -C ${BUILD_DIR} -I pigz -cvf openvino.tgz cov-int + tar -cvf - cov-int | pigz > openvino.tgz popd - name: Submit artefacts diff --git a/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml b/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml index c2da4c1b2d2f9c..83770900559bab 100644 --- a/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml +++ b/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml @@ -158,11 +158,11 @@ jobs: run: | pushd ${INSTALL_DIR} - tar -I pigz -cvf ${BUILD_DIR}/openvino_package.tar.gz * + tar -cvf - * | pigz > ${BUILD_DIR}/openvino_package.tar.gz popd pushd ${INSTALL_TEST_DIR} - tar -I pigz -cvf ${BUILD_DIR}/openvino_tests.tar.gz * + tar -cvf - * | pigz > ${BUILD_DIR}/openvino_tests.tar.gz popd # @@ -230,11 +230,11 @@ jobs: - name: Extract OpenVINO packages run: | pushd $INSTALL_DIR - tar -I pigz -xf openvino_package.tar.gz -C $INSTALL_DIR + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd pushd $INSTALL_TEST_DIR - tar -I pigz -xf openvino_tests.tar.gz -C $INSTALL_DIR + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - name: Snippets func tests @@ -287,11 +287,11 @@ jobs: - name: Extract OpenVINO packages run: | pushd $INSTALL_DIR - tar -I pigz -xf openvino_package.tar.gz -C $INSTALL_DIR + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd pushd $INSTALL_TEST_DIR - tar -I pigz -xf openvino_tests.tar.gz -C $INSTALL_DIR + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - name: Fetch setup_python action diff --git a/.github/workflows/job_build_linux.yml b/.github/workflows/job_build_linux.yml index d58e879c736610..b8eea4375e7e58 100644 --- a/.github/workflows/job_build_linux.yml +++ b/.github/workflows/job_build_linux.yml @@ -182,15 +182,15 @@ jobs: working-directory: ${{ env.BUILD_DIR }} - name: Pack openvino_package - run: tar -I pigz -cvf ${BUILD_DIR}/openvino_package.tar.gz * + run: tar -cvf - * | pigz > ${BUILD_DIR}/openvino_package.tar.gz working-directory: ${{ env.INSTALL_DIR }} - name: Pack openvino_developer_package - run: tar -I pigz -cvf ${BUILD_DIR}/openvino_developer_package.tar.gz * + run: tar -cvf - * | pigz > ${BUILD_DIR}/openvino_developer_package.tar.gz working-directory: ${{ env.DEVELOPER_PACKAGE_DIR }} - name: Pack openvino_tests - run: tar -I pigz -cvf ${BUILD_DIR}/openvino_tests.tar.gz * + run: tar -cvf - * | pigz > ${BUILD_DIR}/openvino_tests.tar.gz working-directory: ${{ env.INSTALL_TEST_DIR }} - name: Build Debian packages diff --git a/.github/workflows/job_cpu_functional_tests.yml b/.github/workflows/job_cpu_functional_tests.yml index 77376d442939a0..24c8542ae80140 100644 --- a/.github/workflows/job_cpu_functional_tests.yml +++ b/.github/workflows/job_cpu_functional_tests.yml @@ -55,15 +55,19 @@ jobs: echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "PARALLEL_TEST_SCRIPT=$GITHUB_WORKSPACE/install/tests/functional_test_utils/layer_tests_summary/run_parallel.py" >> "$GITHUB_ENV" echo "PARALLEL_TEST_CACHE=$GITHUB_WORKSPACE/install/tests/test_cache.lst" >> "$GITHUB_ENV" - + + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install pigz + - name: Extract OpenVINO packages run: | pushd $INSTALL_DIR - tar -I pigz -xf openvino_package.tar.gz -C $INSTALL_DIR + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd pushd $INSTALL_TEST_DIR - tar -I pigz -xf openvino_tests.tar.gz -C 
$INSTALL_DIR + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - name: Fetch setup_python action diff --git a/.github/workflows/job_cxx_unit_tests.yml b/.github/workflows/job_cxx_unit_tests.yml index b83e83af4ed68c..99c363d04d23a7 100644 --- a/.github/workflows/job_cxx_unit_tests.yml +++ b/.github/workflows/job_cxx_unit_tests.yml @@ -60,6 +60,10 @@ jobs: echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "SETUPVARS_COMMAND=${{ env.SOURCE_COMMAND }} $GITHUB_WORKSPACE/install/${{ env.SETUPVARS }}" >> "$GITHUB_ENV" + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install pigz + - name: Setup Variables (Windows) if: ${{ runner.os == 'Windows' }} run: Add-Content -Path $env:GITHUB_ENV -Value "SETUPVARS_COMMAND=${{ env.SOURCE_COMMAND }} ${{ github.workspace }}/install/${{ env.SETUPVARS }}" @@ -68,10 +72,10 @@ jobs: if: ${{ runner.os != 'Windows' }} run: | pushd $INSTALL_DIR - tar -I pigz -xf openvino_package.tar.gz -C $INSTALL_DIR + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd pushd $INSTALL_TEST_DIR - tar -I pigz -xf openvino_tests.tar.gz -C $INSTALL_DIR + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - name: Extract OpenVINO packages (Windows) diff --git a/.github/workflows/job_gpu_tests.yml b/.github/workflows/job_gpu_tests.yml index 147afcccddfe17..324e653c57ebab 100644 --- a/.github/workflows/job_gpu_tests.yml +++ b/.github/workflows/job_gpu_tests.yml @@ -59,10 +59,10 @@ jobs: - name: Extract OpenVINO packages run: | pushd $INSTALL_DIR - tar -I pigz -xf openvino_package.tar.gz -C $INSTALL_DIR + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd pushd $INSTALL_TEST_DIR - tar -I pigz -xf openvino_tests.tar.gz -C $INSTALL_DIR + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - name: Install dependencies (Linux) diff --git a/.github/workflows/job_jax_models_tests.yml b/.github/workflows/job_jax_models_tests.yml index 2fed97a78e9c07..9956a27f234b36 100644 --- a/.github/workflows/job_jax_models_tests.yml +++ b/.github/workflows/job_jax_models_tests.yml @@ -60,7 +60,7 @@ jobs: - name: Extract OpenVINO packages and tests run: | - tar -I pigz -xf openvino_tests.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - name: Fetch setup_python action diff --git a/.github/workflows/job_onnx_models_tests.yml b/.github/workflows/job_onnx_models_tests.yml index 0eda00f7afb937..321aa88d614310 100644 --- a/.github/workflows/job_onnx_models_tests.yml +++ b/.github/workflows/job_onnx_models_tests.yml @@ -64,7 +64,7 @@ jobs: - name: Extract OpenVINO packages and tests run: | - tar -I pigz -xf openvino_tests.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} # Issue 148922 diff --git a/.github/workflows/job_onnx_runtime.yml b/.github/workflows/job_onnx_runtime.yml index 61b13939fc60b7..0ceb080d82184d 100644 --- a/.github/workflows/job_onnx_runtime.yml +++ b/.github/workflows/job_onnx_runtime.yml @@ -59,7 +59,7 @@ jobs: - name: Extract OpenVINO package run: | pushd ${INSTALL_DIR} - tar -I pigz -xf openvino_package.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - name: Fetch ONNX runtime version and skip tests list diff --git a/.github/workflows/job_python_unit_tests.yml b/.github/workflows/job_python_unit_tests.yml index 8db2ebf86dca91..d63262c665d45c 100644 
--- a/.github/workflows/job_python_unit_tests.yml +++ b/.github/workflows/job_python_unit_tests.yml @@ -66,9 +66,13 @@ jobs: echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "LAYER_TESTS_INSTALL_DIR=$GITHUB_WORKSPACE/install/tests/layer_tests" >> "$GITHUB_ENV" - - name: Extract OpenVINO artifacts + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install pigz + + - name: Extract OpenVINO packages run: | - tar -I pigz -xf openvino_tests.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - name: Fetch setup_python action diff --git a/.github/workflows/job_pytorch_layer_tests.yml b/.github/workflows/job_pytorch_layer_tests.yml index c4f0d1efb37c75..95074dc84f1ff9 100644 --- a/.github/workflows/job_pytorch_layer_tests.yml +++ b/.github/workflows/job_pytorch_layer_tests.yml @@ -65,11 +65,15 @@ jobs: echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "LAYER_TESTS_INSTALL_DIR=$GITHUB_WORKSPACE/install/tests/layer_tests" >> "$GITHUB_ENV" - - - name: Extract OpenVINO artifacts (Linux, macOS) + + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install pigz + + - name: Extract OpenVINO packages (Linux, macOS) if: runner.os != 'Windows' run: | - tar -I pigz -xf openvino_tests.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - name: Extract OpenVINO artifacts (Windows) diff --git a/.github/workflows/job_pytorch_models_tests.yml b/.github/workflows/job_pytorch_models_tests.yml index ce40dd7f0618ce..a77c1318f3a0c8 100644 --- a/.github/workflows/job_pytorch_models_tests.yml +++ b/.github/workflows/job_pytorch_models_tests.yml @@ -73,7 +73,7 @@ jobs: - name: Extract OpenVINO artifacts run: | - tar -I pigz -xf openvino_tests.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - name: Fetch setup_python action diff --git a/.github/workflows/job_samples_tests.yml b/.github/workflows/job_samples_tests.yml index 12c63644d7b586..7cde4e6fd18eae 100644 --- a/.github/workflows/job_samples_tests.yml +++ b/.github/workflows/job_samples_tests.yml @@ -54,17 +54,17 @@ jobs: echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "BUILD_DIR=$GITHUB_WORKSPACE/build" >> "$GITHUB_ENV" + + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install coreutils pigz - - name: Extract OpenVINO packages, wheels and tests + - name: Extract OpenVINO packages and tests run: | - tar -I pigz -xf openvino_package.tar.gz -C ${INSTALL_DIR} - tar -I pigz -xf openvino_tests.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - - name: Install OpenVINO dependencies (mac) - if: runner.os == 'macOS' - run: brew install coreutils - - name: Fetch setup_python action # Python is already installed on Ubuntu within Dockerfile if: runner.os != 'Linux' diff --git a/.github/workflows/job_tensorflow_layer_tests.yml b/.github/workflows/job_tensorflow_layer_tests.yml index 3ad19d3301945f..ae6e91a00d1497 100644 --- a/.github/workflows/job_tensorflow_layer_tests.yml +++ 
b/.github/workflows/job_tensorflow_layer_tests.yml @@ -66,10 +66,14 @@ jobs: echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "LAYER_TESTS_INSTALL_DIR=$GITHUB_WORKSPACE/install/tests/layer_tests" >> "$GITHUB_ENV" - - name: Extract OpenVINO artifacts (Linux and macOS) + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install pigz + + - name: Extract OpenVINO packages (Linux, macOS) if: runner.os != 'Windows' run: | - tar -I pigz -xf openvino_tests.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - name: Extract OpenVINO artifacts (Windows) diff --git a/.github/workflows/job_tensorflow_models_tests.yml b/.github/workflows/job_tensorflow_models_tests.yml index 76ee01cc76c3ef..db34ec7b793551 100644 --- a/.github/workflows/job_tensorflow_models_tests.yml +++ b/.github/workflows/job_tensorflow_models_tests.yml @@ -65,7 +65,7 @@ jobs: - name: Extract OpenVINO artifacts (Linux and macOS) run: | - tar -I pigz -xf openvino_tests.tar.gz -C . + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - name: Fetch setup_python action diff --git a/.github/workflows/job_tokenizers.yml b/.github/workflows/job_tokenizers.yml index 089b104d7af1d1..238dbfec3a34eb 100644 --- a/.github/workflows/job_tokenizers.yml +++ b/.github/workflows/job_tokenizers.yml @@ -58,6 +58,10 @@ jobs: .github/actions/setup_python .github/actions/cache install_build_dependencies.sh + + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install pigz - name: Setup Python ${{ env.PYTHON_VERSION }} uses: ./.github/actions/setup_python diff --git a/.github/workflows/linux_conditional_compilation.yml b/.github/workflows/linux_conditional_compilation.yml index 6f9b761ce3352c..7b5467b01ad73e 100644 --- a/.github/workflows/linux_conditional_compilation.yml +++ b/.github/workflows/linux_conditional_compilation.yml @@ -200,23 +200,23 @@ jobs: - name: Pack Artifacts run: | pushd ${SELECTIVE_BUILD_STAT_DIR} - tar -I pigz -cvf ${BUILD_DIR}/openvino_selective_build_stat.tar.gz * + tar -cvf - * | pigz > ${BUILD_DIR}/openvino_selective_build_stat.tar.gz popd pushd ${INSTALL_DIR} - tar -I pigz -cvf ${BUILD_DIR}/openvino_package.tar.gz \ - install_dependencies/install_openvino_dependencies.sh + tar -cvf - install_dependencies/install_openvino_dependencies.sh | pigz > ${BUILD_DIR}/openvino_package.tar.gz popd cp -v ${OPENVINO_REPO}/temp/tbb/lib/lib* ${INSTALL_TEST_DIR}/tests pushd ${INSTALL_TEST_DIR} - tar -I pigz -cvf ${BUILD_DIR}/openvino_tests.tar.gz \ + tar -cvf - \ tests/ov_cpu_func_tests \ tests/libopenvino_template_extension.so \ tests/libze_loader.so* \ tests/libhwloc* \ tests/libtbb* \ - tests/functional_test_utils/layer_tests_summary/* + tests/functional_test_utils/layer_tests_summary/* \ + | pigz > ${BUILD_DIR}/openvino_tests.tar.gz popd # @@ -302,7 +302,8 @@ jobs: path: ${{ env.SELECTIVE_BUILD_STAT_DIR }} - name: Extract selective build statistics package - run: tar -I pigz -xvf ${SELECTIVE_BUILD_STAT_DIR}/openvino_selective_build_stat.tar.gz -C ${SELECTIVE_BUILD_STAT_DIR} + run: | + pigz -dc ${SELECTIVE_BUILD_STAT_DIR}/openvino_selective_build_stat.tar.gz | tar -xf - -C ${SELECTIVE_BUILD_STAT_DIR} # # Build diff --git a/.github/workflows/linux_sanitizers.yml b/.github/workflows/linux_sanitizers.yml index cec499e7971130..b23e67a0f2b30e 100644 --- a/.github/workflows/linux_sanitizers.yml +++ 
b/.github/workflows/linux_sanitizers.yml @@ -175,11 +175,11 @@ jobs: - name: Pack Artifacts run: | pushd ${INSTALL_DIR} - tar -I pigz -cvf ${BUILD_DIR}/openvino_package.tar.gz * + tar -cvf - * | pigz > ${BUILD_DIR}/openvino_package.tar.gz popd pushd ${INSTALL_TEST_DIR} - tar -I pigz -cvf ${BUILD_DIR}/openvino_tests.tar.gz * + tar -cvf - * | pigz > ${BUILD_DIR}/openvino_tests.tar.gz popd # @@ -257,10 +257,10 @@ jobs: - name: Extract OpenVINO packages run: | pushd $INSTALL_DIR - tar -I pigz -xf openvino_package.tar.gz -C $INSTALL_DIR + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd pushd $INSTALL_TEST_DIR - tar -I pigz -xf openvino_tests.tar.gz -C $INSTALL_DIR + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - name: Install dependencies (Linux) diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index b91bd65465621a..6e3f344c6dd944 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -77,6 +77,7 @@ jobs: INSTALL_DIR: ${{ github.workspace }}/openvino_install INSTALL_DIR_JS: ${{ github.workspace }}/openvino_install/js INSTALL_TEST_DIR: ${{ github.workspace }}/tests_install + INSTALL_WHEELS_DIR: ${{ github.workspace }}/install/wheels BUILD_DIR: ${{ github.workspace }}/build if: "!needs.smart_ci.outputs.skip_workflow" steps: @@ -104,7 +105,7 @@ jobs: # - name: Install build dependencies - run: brew install coreutils ninja scons + run: brew install coreutils ninja scons pigz - name: Setup Python ${{ env.PYTHON_VERSION }} uses: ./openvino/.github/actions/setup_python @@ -167,16 +168,15 @@ jobs: run: | cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR }} -P ${{ env.BUILD_DIR }}/cmake_install.cmake cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_TEST_DIR }} -DCOMPONENT=tests -P ${{ env.BUILD_DIR }}/cmake_install.cmake - cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR }} -DCOMPONENT=python_wheels -P ${{ env.BUILD_DIR }}/cmake_install.cmake + cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_WHEELS_DIR }} -DCOMPONENT=python_wheels -P ${{ env.BUILD_DIR }}/cmake_install.cmake - name: Pack Artifacts run: | pushd ${{ env.INSTALL_DIR }} - tar -I pigz -cvf ${{ env.BUILD_DIR }}/openvino_package.tar.gz * + tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_package.tar.gz popd - pushd ${{ env.INSTALL_TEST_DIR }} - tar -I pigz -cvf ${{ env.BUILD_DIR }}/openvino_tests.tar.gz * + tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_tests.tar.gz popd - name: Cmake & Build - OpenVINO Contrib @@ -210,6 +210,13 @@ jobs: name: openvino_package path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz if-no-files-found: 'error' + + - name: Upload openvino wheels + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + with: + name: openvino_wheels + path: ${{ env.INSTALL_WHEELS_DIR }}/wheels/*.whl + if-no-files-found: 'error' - name: Upload openvino tests package if: ${{ always() }} diff --git a/.github/workflows/mac_arm64.yml b/.github/workflows/mac_arm64.yml index 8d4843627e7b9f..16658318de20d8 100644 --- a/.github/workflows/mac_arm64.yml +++ b/.github/workflows/mac_arm64.yml @@ -77,6 +77,7 @@ jobs: INSTALL_DIR: ${{ github.workspace }}/openvino_install INSTALL_DIR_JS: ${{ github.workspace }}/openvino_install/js INSTALL_TEST_DIR: ${{ github.workspace }}/tests_install + INSTALL_WHEELS_DIR: ${{ github.workspace }}/install/wheels BUILD_DIR: ${{ github.workspace }}/build if: "!needs.smart_ci.outputs.skip_workflow" steps: @@ -104,7 +105,7 @@ jobs: # - name: Install build dependencies - run: brew install coreutils ninja scons + run: brew 
install coreutils ninja scons pigz - name: Setup Python ${{ env.PYTHON_VERSION }} uses: ./openvino/.github/actions/setup_python @@ -167,16 +168,16 @@ jobs: run: | cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR }} -P ${{ env.BUILD_DIR }}/cmake_install.cmake cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_TEST_DIR }} -DCOMPONENT=tests -P ${{ env.BUILD_DIR }}/cmake_install.cmake - cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR }} -DCOMPONENT=python_wheels -P ${{ env.BUILD_DIR }}/cmake_install.cmake + cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_WHEELS_DIR }} -DCOMPONENT=python_wheels -P ${{ env.BUILD_DIR }}/cmake_install.cmake - name: Pack Artifacts run: | pushd ${{ env.INSTALL_DIR }} - tar -I pigz -cvf ${{ env.BUILD_DIR }}/openvino_package.tar.gz * + tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_package.tar.gz popd pushd ${{ env.INSTALL_TEST_DIR }} - tar -I pigz -cvf ${{ env.BUILD_DIR }}/openvino_tests.tar.gz * + tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_tests.tar.gz popd - name: Cmake & Build - OpenVINO Contrib @@ -210,6 +211,13 @@ jobs: name: openvino_package path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz if-no-files-found: 'error' + + - name: Upload openvino wheels + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + with: + name: openvino_wheels + path: ${{ env.INSTALL_WHEELS_DIR }}/wheels/*.whl + if-no-files-found: 'error' - name: Upload openvino tests package if: ${{ always() }} diff --git a/.github/workflows/ubuntu_22.yml b/.github/workflows/ubuntu_22.yml index 19f63471523726..2ebca2b059fdd2 100644 --- a/.github/workflows/ubuntu_22.yml +++ b/.github/workflows/ubuntu_22.yml @@ -176,10 +176,10 @@ jobs: - name: Extract OpenVINO packages run: | pushd ${INSTALL_DIR} - tar -I pigz -xf openvino_package.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd pushd ${INSTALL_TEST_DIR} - tar -I pigz -xf openvino_tests.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - name: Fetch setup_python action @@ -222,7 +222,7 @@ jobs: if: ${{ always() }} run: | pushd ${CONFORMANCE_ARTIFACTS_DIR} - tar -I pigz -cvf ${CONFORMANCE_ARTIFACTS_DIR}/conformance_artifacts.tar.gz * + tar -cvf - * | pigz > ${CONFORMANCE_ARTIFACTS_DIR}/conformance_artifacts.tar.gz popd - name: Upload Conformance Artifacts @@ -248,7 +248,7 @@ jobs: if: ${{ matrix.TEST_TYPE == 'API' }} run: | pushd ${CONFORMANCE_ARTIFACTS_DIR} - tar -I pigz -cvf ${CONFORMANCE_ARTIFACTS_DIR}/conformance_artifacts.tar.gz * + tar -cvf - * | pigz > ${CONFORMANCE_ARTIFACTS_DIR}/conformance_artifacts.tar.gz popd - name: Upload Conformance Artifacts @@ -451,11 +451,11 @@ jobs: - name: Extract OpenVINO packages run: | pushd ${INSTALL_DIR} - tar -I pigz -xf openvino_package.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd pushd ${INSTALL_DIR} - tar -I pigz -xf openvino_developer_package.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_developer_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - name: Clone OpenVINO Contrib diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7169ebc2ba2c9b..c30ce12665ab33 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -115,6 +115,7 @@ Choose the component your Good First Issue is related to. 
You can run tests to m - [C API](https://github.com/openvinotoolkit/openvino/tree/master/src/bindings/c) - [Core](https://github.com/openvinotoolkit/openvino/tree/master/src/core) - [Python API](https://github.com/openvinotoolkit/openvino/tree/master/src/bindings/python) +- [Node.js API](https://github.com/openvinotoolkit/openvino/tree/master/src/bindings/js/node) ##### Frontends - [IR Frontend](https://github.com/openvinotoolkit/openvino/tree/master/src/frontends/ir) diff --git a/cmake/developer_package/ncc_naming_style/ncc_naming_style.cmake b/cmake/developer_package/ncc_naming_style/ncc_naming_style.cmake index d20582b03cb9fc..67a58d56e901e2 100644 --- a/cmake/developer_package/ncc_naming_style/ncc_naming_style.cmake +++ b/cmake/developer_package/ncc_naming_style/ncc_naming_style.cmake @@ -80,11 +80,11 @@ if(ENABLE_NCC_STYLE) set(CMAKE_FIND_LIBRARY_PREFIXES ${_old_CMAKE_FIND_LIBRARY_PREFIXES}) set(CMAKE_FIND_LIBRARY_SUFFIXES ${_old_CMAKE_FIND_LIBRARY_SUFFIXES}) else() - find_host_package(Clang QUIET) - endif() - - if(Clang_FOUND AND TARGET libclang) - get_target_property(libclang_location libclang LOCATION) + find_host_library(libclang_location + NAMES clang libclang libclang-${clang_version} libclang-${clang_version}.so libclang-${clang_version}.so.1 + PATHS /usr/lib /usr/local/lib /usr/lib/llvm-${clang_version}/lib /usr/lib/x86_64-linux-gnu + NO_DEFAULT_PATH + NO_CMAKE_FIND_ROOT_PATH) endif() if(NOT libclang_location) diff --git a/cmake/developer_package/ncc_naming_style/requirements_dev.txt b/cmake/developer_package/ncc_naming_style/requirements_dev.txt index a304b713cb3a2c..724ea2bf15721d 100644 --- a/cmake/developer_package/ncc_naming_style/requirements_dev.txt +++ b/cmake/developer_package/ncc_naming_style/requirements_dev.txt @@ -1,4 +1,3 @@ -clang==12.0.1; python_version == '3.8' clang==12.0.1; python_version == '3.9' clang==14.0; python_version == '3.10' clang==14.0; python_version == '3.11' diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index eedfe078cbd552..2dfb6bb8d04e81 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -8,7 +8,6 @@ set(ENABLE_CPP_API OFF CACHE BOOL "Build with C/C++ API.") set(ENABLE_PYTHON_API OFF CACHE BOOL "Build with Python API.") set(ENABLE_GENAI_API OFF CACHE BOOL "Build with GenAI API.") set(ENABLE_NOTEBOOKS OFF CACHE BOOL "Build with openvino notebooks.") -set(ENABLE_OMZ OFF CACHE BOOL "Build with open_model_zoo.") set(ENABLE_OVMS OFF CACHE BOOL "Build with ovms.") set(OVMS_DOCS_DIR "" CACHE PATH "Path to model server documentation dir.") @@ -90,17 +89,6 @@ function(build_docs) list(APPEND commands COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "FINISHED preprocessing OVMS") endif() - if(${ENABLE_OMZ}) - list(APPEND commands COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "STARTED preprocessing OMZ") - list(APPEND commands - COMMAND ${Python3_EXECUTABLE} ${OpenVINO_SOURCE_DIR}/thirdparty/open_model_zoo/ci/prepare-documentation.py ${CMAKE_BINARY_DIR}/open_model_zoo) - list(APPEND commands COMMAND ${Python3_EXECUTABLE} ${FILE_HELPER_SCRIPT} - --filetype=md - --input_dir=${CMAKE_BINARY_DIR}/open_model_zoo - --output_dir=${SPHINX_SOURCE_DIR} - --exclude_dir=${SPHINX_SOURCE_DIR}) - list(APPEND commands COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "FINISHED preprocessing OMZ") - endif() # Preprocess docs add_custom_target(preprocess_docs diff --git a/docs/articles_en/about-openvino/performance-benchmarks.rst b/docs/articles_en/about-openvino/performance-benchmarks.rst index aa60c44a2ad5c8..40b94210f6c43d 100644 --- 
a/docs/articles_en/about-openvino/performance-benchmarks.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks.rst @@ -18,7 +18,7 @@ Performance Benchmarks This page presents benchmark results for `Intel® Distribution of OpenVINO™ toolkit `__ -and :doc:`OpenVINO Model Server <../ovms_what_is_openvino_model_server>`, for a representative +and :doc:`OpenVINO Model Server <../openvino-workflow/model-server/ovms_what_is_openvino_model_server>`, for a representative selection of public neural networks and Intel® devices. The results may help you decide which hardware to use in your applications or plan AI workload for the hardware you have already implemented in your solutions. Click the buttons below to see the chosen benchmark data. diff --git a/docs/articles_en/documentation/openvino-ecosystem/openvino-security-add-on.rst b/docs/articles_en/documentation/openvino-ecosystem/openvino-security-add-on.rst index ea76392be4e2e6..2d5598a5eb8e9d 100644 --- a/docs/articles_en/documentation/openvino-ecosystem/openvino-security-add-on.rst +++ b/docs/articles_en/documentation/openvino-ecosystem/openvino-security-add-on.rst @@ -17,7 +17,7 @@ In this release, one person performs the role of both the Model Developer and th Overview ######## -The OpenVINO™ Security Add-on works with the :doc:`OpenVINO™ Model Server <../../ovms_what_is_openvino_model_server>` on Intel® architecture. Together, the OpenVINO™ Security Add-on and the OpenVINO™ Model Server provide a way for Model Developers and Independent Software Vendors to use secure packaging and secure model execution to enable access control to the OpenVINO™ models, and for model Users to run inference within assigned limits. +The OpenVINO™ Security Add-on works with the :doc:`OpenVINO™ Model Server <../../openvino-workflow/model-server/ovms_what_is_openvino_model_server>` on Intel® architecture. Together, the OpenVINO™ Security Add-on and the OpenVINO™ Model Server provide a way for Model Developers and Independent Software Vendors to use secure packaging and secure model execution to enable access control to the OpenVINO™ models, and for model Users to run inference within assigned limits. The OpenVINO™ Security Add-on consists of three components that run in Kernel-based Virtual Machines (KVMs). These components provide a way to run security-sensitive operations in an isolated environment. A brief description of the three components are as follows. Click each triangled line for more information about each. diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst index a77527db114bc7..4585ca97488023 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst @@ -102,7 +102,7 @@ Use the following code snippet to change the default settings: .. code-block:: python - pipeline_config = { "MAX_PROMPT_LEN": 1500, "MIN_RESPONSE_LEN": 500 } + pipeline_config = { "MAX_PROMPT_LEN": 1024, "MIN_RESPONSE_LEN": 512 } pipe = ov_genai.LLMPipeline(model_path, "NPU", pipeline_config) .. tab-item:: C++ @@ -110,7 +110,7 @@ Use the following code snippet to change the default settings: .. 
code-block:: cpp - ov::AnyMap pipeline_config = { { "MAX_PROMPT_LEN", 1500 }, { "MIN_RESPONSE_LEN", 500 } }; + ov::AnyMap pipeline_config = { { "MAX_PROMPT_LEN", 1024 }, { "MIN_RESPONSE_LEN", 512 } }; ov::genai::LLMPipeline pipe(model_path, "NPU", pipeline_config); diff --git a/docs/articles_en/openvino-workflow.rst b/docs/articles_en/openvino-workflow.rst index 0dda91f91fb552..942d6ed4b13a96 100644 --- a/docs/articles_en/openvino-workflow.rst +++ b/docs/articles_en/openvino-workflow.rst @@ -89,7 +89,7 @@ OpenVINO uses the following functions for reading, converting, and saving models | Deploy a model locally, reading the file directly from your application and utilizing about-openvino/additional-resources available to the system. | Deployment on a local system uses the steps described in the section on running inference. -| :doc:`Deployment Option 2. Using Model Server ` +| :doc:`Deployment Option 2. Using Model Server ` | Deploy a model remotely, connecting your application to an inference server and utilizing external about-openvino/additional-resources, with no impact on the app's performance. | Deployment on OpenVINO Model Server is quick and does not require any additional steps described in the section on running inference. diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst index 6c85473502ff9b..6348ca897c5ea5 100644 --- a/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst @@ -1,5 +1,5 @@ -Weight Compression -================== +LLM Weight Compression +========================= .. toctree:: :maxdepth: 1 @@ -187,7 +187,7 @@ trade-offs after optimization: ratio=0.9, ) -* ``scale_estimation`` - boolean parameter that enables more accurate estimation of +* ``scale_estimation`` - boolean parameter that enables more accurate estimation of quantization scales. Especially helpful when the weights of all layers are quantized to 4 bits. Requires dataset. diff --git a/docs/articles_en/openvino-workflow/model-optimization.rst b/docs/articles_en/openvino-workflow/model-optimization.rst index b4b6cc64acb21b..f5a5f97341e960 100644 --- a/docs/articles_en/openvino-workflow/model-optimization.rst +++ b/docs/articles_en/openvino-workflow/model-optimization.rst @@ -22,7 +22,7 @@ It is a `set of compression algorithms `__ and -`NNCF API documentation `__. + +Recommended workflows +########################## + +* A common approach for most cases is to: + + 1. Perform post-training quantization first, as it is the easiest option. + 2. For even better results, combine post-training quantization with filter pruning. + 3. If the accuracy drop is unacceptable, use quantization-aware training instead. It will give + you the same level of performance boost, with a smaller impact on accuracy. + +* **Weight compression** works **only with LLMs**. Do not try to use it with other models. +* For **visual-multimodal** use cases, the encoder / decoder split approach may be recommended. + + + + + .. image:: ../assets/images/DEVELOPMENT_FLOW_V3_crunch.svg + +Installation and usage +########################### + +To learn about the full scope of the framework, its installation, and technical details, visit +both `the NNCF repository `__ and +`NNCF API documentation `__. + + + .. tab-set:: .. 
tab-item:: Installation diff --git a/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst b/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst index 249fc8c4884cc1..86788b20249a3f 100644 --- a/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst +++ b/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst @@ -140,4 +140,4 @@ sequences. You can find more examples demonstrating how to work with states in other articles: * `LLM Chatbot notebook <../../notebooks/stable-zephyr-3b-chatbot-with-output.html>`__ -* :doc:`Serving Stateful Models with OpenVINO Model Server <../../ovms_docs_stateful_models>` +* :doc:`Serving Stateful Models with OpenVINO Model Server <../../openvino-workflow/model-server/ovms_docs_stateful_models>` diff --git a/docs/documentation_build_instructions.md b/docs/documentation_build_instructions.md index 490da1b1029bd3..d9219454b86a19 100644 --- a/docs/documentation_build_instructions.md +++ b/docs/documentation_build_instructions.md @@ -45,5 +45,4 @@ Depending on the needs, following variables can be added to first cmake call: - building C/C++ API: `-DENABLE_CPP_API=ON` - building Python API: `-DENABLE_PYTHON_API=ON` - building Notebooks: `-DENABLE_NOTEBOOKS=ON` -- building OMZ: `-DENABLE_OMZ=ON` - building OVMS: `-DENABLE_OVMS=ON -DOVMS_DOCS_DIR=` diff --git a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-Performance-Data.xlsx b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-Performance-Data.xlsx index 057e132d384167..9b53d90e0862db 100644 Binary files a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-Performance-Data.xlsx and b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-Performance-Data.xlsx differ diff --git a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-platform_list.pdf b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-platform_list.pdf index 6782dea6d0f3b2..cba78e5244acf1 100644 Binary files a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-platform_list.pdf and b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-platform_list.pdf differ diff --git a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-system-info-detailed.xlsx b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-system-info-detailed.xlsx index 7c4d9cd0e40919..4e243b8190c876 100644 Binary files a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-system-info-detailed.xlsx and b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-system-info-detailed.xlsx differ diff --git a/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms.json b/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms.json index 8457e2be7a6f4a..18a36073d582f5 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms.json +++ b/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms.json @@ -1,1102 +1,1047 @@ [ { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "bert-base-cased", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 479.649, - "int8": 482.878, - "ovmsfp32": 180.7, - "fp32": 179.541 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "bert-base-cased", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 159.534, + "fp32_ovms": 157.334, + "int8_ov": 432.339, + "int8_ovms": 420.793 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - 
"UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "bert-base-cased", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 428.173, - "int8": 430.397, - "ovmsfp32": 156.73, - "fp32": 159.276 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "bert-large-uncased-whole-word-masking-squad-0001", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 13.125, + "fp32_ovms": 13.254, + "int8_ov": 38.151, + "int8_ovms": 37.623 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "bert-base-cased", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 100.783, - "int8": 101.983, - "ovmsfp32": 35.711, - "fp32": 36.35 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "efficientdet-d0", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 247.445, + "fp32_ovms": 253.09, + "int8_ov": 413.083, + "int8_ovms": 377.844 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "bert-base-cased", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 98.441, - "int8": 102.62, - "ovmsfp32": 34.303, - "fp32": 36.096 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "mask_rcnn_resnet50_atrous_coco", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 1.638, + "fp32_ovms": 1.714, + "int8_ov": 6.202, + "int8_ovms": 6.126 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "bert-base-cased", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 26.185, - "int8": 26.436, - "ovmsfp32": 17.108, - "fp32": 17.395 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "mobilenet-v2", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 3333.399, + "fp32_ovms": 2905.171, + "int8_ov": 10422.241, + "int8_ovms": 7461.99 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "bert-large-uncased", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 41.872, - "int8": 42.401, - "ovmsfp32": 14.949, - "fp32": 14.473 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "resnet-50", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 575.208, + "fp32_ovms": 569.925, + "int8_ov": 2199.072, + "int8_ovms": 2064.581 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 
6238M CPU-only", - "Model": "bert-large-uncased", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 37.05, - "int8": 37.864, - "ovmsfp32": 13.075, - "fp32": 13.031 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "ssd-resnet34-1200", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 10.598, + "fp32_ovms": 10.472, + "int8_ov": 40.683, + "int8_ovms": 38.737 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "bert-large-uncased", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 10.047, - "int8": 10.111, - "ovmsfp32": 3.259, - "fp32": 3.237 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "ssd_mobilenet_v1_coco", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 1219.441, + "fp32_ovms": 1201.096, + "int8_ov": 4400.471, + "int8_ovms": 4270.702 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "bert-large-uncased", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 9.961, - "int8": 10.167, - "ovmsfp32": 3.236, - "fp32": 3.224 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "unet-camvid-onnx-0001", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 15.924, + "fp32_ovms": 15.763, + "int8_ov": 67.731, + "int8_ovms": 64.658 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "bert-large-uncased", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 2.43, - "int8": 2.427, - "ovmsfp32": 1.447, - "fp32": 1.428 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "yolo_v5m", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 74.189, + "fp32_ovms": 68.788, + "int8_ov": 247.757, + "int8_ovms": 180.302 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "Efficientdet-D0", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 439.435, - "int8": 485.287, - "ovmsfp32": 274.772, - "fp32": 272.856 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "yolo_v8n", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 332.326, + "fp32_ovms": 278.054, + "int8_ov": 740.985, + "int8_ovms": 609.062 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "Efficientdet-D0", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", 
- "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 376.1, - "int8": 415.275, - "ovmsfp32": 253.829, - "fp32": 259.188 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "bert-base-cased", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 182.025, + "fp32_ovms": 180.764, + "int8_ov": 485.82, + "int8_ovms": 472.842 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "Efficientdet-D0", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 131.735, - "int8": 148.558, - "ovmsfp32": 57.036, - "fp32": 59.907 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "bert-large-uncased-whole-word-masking-squad-0001", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 14.625, + "fp32_ovms": 15.132, + "int8_ov": 42.906, + "int8_ovms": 42.406 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "Efficientdet-D0", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 119.798, - "int8": 140.129, - "ovmsfp32": "", - "fp32": "" + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "efficientdet-d0", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 288.531, + "fp32_ovms": 278.548, + "int8_ov": 483.438, + "int8_ovms": 443.032 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "Efficientdet-D0", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 47.382, - "int8": 50.573, - "ovmsfp32": 30.226, - "fp32": 31.492 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "mask_rcnn_resnet50_atrous_coco", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 1.872, + "fp32_ovms": 1.95, + "int8_ov": 6.856, + "int8_ovms": 6.763 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 6.306, - "int8": 6.364, - "ovmsfp32": 1.96, - "fp32": 1.868 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "mobilenet-v2", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 3909.405, + "fp32_ovms": 3327.621, + "int8_ov": 12375.018, + "int8_ovms": 7554.235 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - 
"Precisions": [ - { - "ovmsint8": 5.652, - "int8": 5.771, - "ovmsfp32": 1.714, - "fp32": 1.639 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "resnet-50", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 634.732, + "fp32_ovms": 634.102, + "int8_ov": 2481.256, + "int8_ovms": 2349.872 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 1.309, - "int8": 1.267, - "ovmsfp32": 0.396, - "fp32": 0.371 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "ssd-resnet34-1200", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 12.166, + "fp32_ovms": 12.027, + "int8_ov": 47.295, + "int8_ovms": 44.525 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 1.293, - "int8": 1.271, - "ovmsfp32": 0.355, - "fp32": 0.346 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "ssd_mobilenet_v1_coco", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 1384.145, + "fp32_ovms": 1356.126, + "int8_ov": 5037.197, + "int8_ovms": 4834.045 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 0.38, - "int8": 0.352, - "ovmsfp32": 0.182, - "fp32": 0.151 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "unet-camvid-onnx-0001", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 18.26, + "fp32_ovms": 18.052, + "int8_ov": 77.933, + "int8_ovms": 73.527 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "Mobilenet-V2", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 7563.199, - "int8": 12406.597, - "ovmsfp32": 3336.015, - "fp32": 3972.673 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "yolo_v5m", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 85.149, + "fp32_ovms": 78.205, + "int8_ov": 281.889, + "int8_ovms": 204.353 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "Mobilenet-V2", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 7475.62, - "int8": 10373.146, - 
"ovmsfp32": 2934.976, - "fp32": 3381.725 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "yolo_v8n", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 376.079, + "fp32_ovms": 312.181, + "int8_ov": 801.556, + "int8_ovms": 678.929 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "Mobilenet-V2", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 2158.818, - "int8": 2742.363, - "ovmsfp32": 740.988, - "fp32": 874.037 + "Platform": "Intel® Core™ i7-11700K", + "Model": "bert-base-cased", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 35.915, + "fp32_ovms": 34.381, + "int8_ov": 101.976, + "int8_ovms": 99.024 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "Mobilenet-V2", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 2042.633, - "int8": 2809.471, - "ovmsfp32": 631.59, - "fp32": 759.984 + "Platform": "Intel® Core™ i7-11700K", + "Model": "bert-large-uncased-whole-word-masking-squad-0001", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 3.232, + "fp32_ovms": 3.266, + "int8_ov": 10.132, + "int8_ovms": 10.133 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "Mobilenet-V2", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 614.174, - "int8": 718.416, - "ovmsfp32": 381.882, - "fp32": 455.793 + "Platform": "Intel® Core™ i7-11700K", + "Model": "efficientdet-d0", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 51.747, + "fp32_ovms": 48.906, + "int8_ov": 142.489, + "int8_ovms": 124.167 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "Resnet-50", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 2356.238, - "int8": 2483.3, - "ovmsfp32": 628.616, - "fp32": 635.411 + "Platform": "Intel® Core™ i7-11700K", + "Model": "mask_rcnn_resnet50_atrous_coco", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 0.352, + "fp32_ovms": 0.364, + "int8_ov": 1.322, + "int8_ovms": 1.336 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "Resnet-50", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 2071.836, - "int8": 2202.317, - "ovmsfp32": 568.945, - "fp32": 575.057 + "Platform": "Intel® Core™ i7-11700K", + "Model": 
"mobilenet-v2", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 795.18, + "fp32_ovms": 664.842, + "int8_ov": 2721.454, + "int8_ovms": 2063.761 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "Resnet-50", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 440.533, - "int8": 458.665, - "ovmsfp32": 113.442, - "fp32": 116.116 + "Platform": "Intel® Core™ i7-11700K", + "Model": "resnet-50", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 114.859, + "fp32_ovms": 110.835, + "int8_ov": 467.591, + "int8_ovms": 445.408 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "Resnet-50", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 441.7, - "int8": 469.848, - "ovmsfp32": 107.395, - "fp32": 113.605 + "Platform": "Intel® Core™ i7-11700K", + "Model": "ssd-resnet34-1200", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 2.053, + "fp32_ovms": 2.074, + "int8_ov": 8.023, + "int8_ovms": 7.987 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "Resnet-50", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 114.045, - "int8": 118.024, - "ovmsfp32": 57.165, - "fp32": 58.366 + "Platform": "Intel® Core™ i7-11700K", + "Model": "ssd_mobilenet_v1_coco", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 260.104, + "fp32_ovms": 250.094, + "int8_ov": 991.064, + "int8_ovms": 930.128 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "SSD-Resnet34-1200", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 44.499, - "int8": 47.251, - "ovmsfp32": 12.074, - "fp32": 12.167 + "Platform": "Intel® Core™ i7-11700K", + "Model": "unet-camvid-onnx-0001", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 3.273, + "fp32_ovms": 3.3, + "int8_ov": 12.884, + "int8_ovms": 12.727 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "SSD-Resnet34-1200", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 38.714, - "int8": 40.662, - "ovmsfp32": 10.504, - "fp32": 10.653 + "Platform": "Intel® Core™ i7-11700K", + "Model": "yolo_v5m", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 
14.714, + "fp32_ovms": 14.243, + "int8_ov": 55.058, + "int8_ovms": 47.548 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "SSD-Resnet34-1200", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 7.756, - "int8": 7.818, - "ovmsfp32": 2.029, - "fp32": 2.005 + "Platform": "Intel® Core™ i7-11700K", + "Model": "yolo_v8n", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 71.446, + "fp32_ovms": 64.775, + "int8_ov": 200.864, + "int8_ovms": 144.792 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "SSD-Resnet34-1200", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 7.929, - "int8": 8.032, - "ovmsfp32": 2.072, - "fp32": 2.054 + "Platform": "Intel® Core™ i9-11900K", + "Model": "bert-base-cased", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 36.227, + "fp32_ovms": 35.646, + "int8_ov": 101.562, + "int8_ovms": 100.382 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "SSD-Resnet34-1200", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 1.947, - "int8": 1.937, - "ovmsfp32": 1.037, - "fp32": 1.008 + "Platform": "Intel® Core™ i9-11900K", + "Model": "bert-large-uncased-whole-word-masking-squad-0001", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 3.23, + "fp32_ovms": 3.254, + "int8_ov": 10.05, + "int8_ovms": 10.092 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "SSD_Mobilenet_V1_Coco", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 4732.691, - "int8": 4875.291, - "ovmsfp32": 1362.268, - "fp32": 1375.237 + "Platform": "Intel® Core™ i9-11900K", + "Model": "efficientdet-d0", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 59.759, + "fp32_ovms": 55.851, + "int8_ov": 149.505, + "int8_ovms": 131.453 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "SSD_Mobilenet_V1_Coco", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 4168.575, - "int8": 4279.825, - "ovmsfp32": 1199.883, - "fp32": 1226.189 + "Platform": "Intel® Core™ i9-11900K", + "Model": "mask_rcnn_resnet50_atrous_coco", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 0.368, + "fp32_ovms": 0.394, + "int8_ov": 1.308, + "int8_ovms": 1.338 + } + ], 
+ "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "SSD_Mobilenet_V1_Coco", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 921.041, - "int8": 1001.672, - "ovmsfp32": 268.066, - "fp32": 280.987 + "Platform": "Intel® Core™ i9-11900K", + "Model": "mobilenet-v2", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 865.806, + "fp32_ovms": 734.822, + "int8_ov": 2743.201, + "int8_ovms": 2163.412 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "SSD_Mobilenet_V1_Coco", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 915.4, - "int8": 1028.233, - "ovmsfp32": 244.534, - "fp32": 260.822 + "Platform": "Intel® Core™ i9-11900K", + "Model": "resnet-50", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 116.784, + "fp32_ovms": 113.046, + "int8_ov": 457.358, + "int8_ovms": 440.924 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "SSD_Mobilenet_V1_Coco", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 256.018, - "int8": 266.401, - "ovmsfp32": 129.917, - "fp32": 135.312 + "Platform": "Intel® Core™ i9-11900K", + "Model": "ssd-resnet34-1200", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 2.006, + "fp32_ovms": 2.031, + "int8_ov": 7.817, + "int8_ovms": 7.75 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "Unet-Camvid--0001", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 73.429, - "int8": 77.693, - "ovmsfp32": 18.104, - "fp32": 17.938 + "Platform": "Intel® Core™ i9-11900K", + "Model": "ssd_mobilenet_v1_coco", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 274.42, + "fp32_ovms": 264.153, + "int8_ov": 997.987, + "int8_ovms": 915.681 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "Unet-Camvid--0001", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 64.29, - "int8": 67.517, - "ovmsfp32": 15.777, - "fp32": 15.927 + "Platform": "Intel® Core™ i9-11900K", + "Model": "unet-camvid-onnx-0001", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 3.246, + "fp32_ovms": 3.272, + "int8_ov": 12.668, + "int8_ovms": 12.585 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is 
better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "Unet-Camvid--0001", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 12.574, - "int8": 12.628, - "ovmsfp32": 3.267, - "fp32": 3.253 + "Platform": "Intel® Core™ i9-11900K", + "Model": "yolo_v5m", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 14.985, + "fp32_ovms": 14.514, + "int8_ov": 54.937, + "int8_ovms": 47.767 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "Unet-Camvid--0001", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 12.718, - "int8": 12.881, - "ovmsfp32": 3.272, - "fp32": 3.297 + "Platform": "Intel® Core™ i9-11900K", + "Model": "yolo_v8n", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 74.1, + "fp32_ovms": 67.472, + "int8_ov": 203.493, + "int8_ovms": 151.175 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "Unet-Camvid--0001", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 2.995, - "int8": 2.976, - "ovmsfp32": 1.555, - "fp32": 1.53 + "Platform": "Intel® Core™ i3-10100", + "Model": "bert-base-cased", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 17.054, + "fp32_ovms": 17.124, + "int8_ov": 26.043, + "int8_ovms": 25.872 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "Yolo_V3_Tiny", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 1842.129, - "int8": 2317.052, - "ovmsfp32": 755.451, - "fp32": 777.681 + "Platform": "Intel® Core™ i3-10100", + "Model": "bert-large-uncased-whole-word-masking-squad-0001", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 1.434, + "fp32_ovms": 1.456, + "int8_ov": 2.421, + "int8_ovms": 2.450 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "Yolo_V3_Tiny", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 1667.812, - "int8": 2056.27, - "ovmsfp32": 675.447, - "fp32": 704.412 + "Platform": "Intel® Core™ i3-10100", + "Model": "efficientdet-d0", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 31.321, + "fp32_ovms": 30.316, + "int8_ov": 50.629, + "int8_ovms": 47.377 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "Yolo_V3_Tiny", - "Checked": true, - 
"PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 431.387, - "int8": 504.093, - "ovmsfp32": 145.92, - "fp32": 151.499 + "Platform": "Intel® Core™ i3-10100", + "Model": "mask_rcnn_resnet50_atrous_coco", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 0.151, + "fp32_ovms": 0.182, + "int8_ov": 0.361, + "int8_ovms": 0.389 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "Yolo_V3_Tiny", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 409.268, - "int8": 516.794, - "ovmsfp32": 139.903, - "fp32": 147.235 + "Platform": "Intel® Core™ i3-10100", + "Model": "mobilenet-v2", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 442.763, + "fp32_ovms": 380.661, + "int8_ov": 724.232, + "int8_ovms": 617.393 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "Yolo_V3_Tiny", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 117.276, - "int8": 133.578, - "ovmsfp32": 65.341, - "fp32": 69.29 + "Platform": "Intel® Core™ i3-10100", + "Model": "resnet-50", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 57.978, + "fp32_ovms": 57.038, + "int8_ov": 118.213, + "int8_ovms": 113.691 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "Yolo_V8n", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": "", - "int8": "", - "ovmsfp32": 314.652, - "fp32": 386.299 + "Platform": "Intel® Core™ i3-10100", + "Model": "ssd-resnet34-1200", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 1.0, + "fp32_ovms": 1.031, + "int8_ov": 1.937, + "int8_ovms": 1.954 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "Yolo_V8n", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": "", - "int8": "", - "ovmsfp32": 282.302, - "fp32": 340.845 + "Platform": "Intel® Core™ i3-10100", + "Model": "ssd_mobilenet_v1_coco", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 133.421, + "fp32_ovms": 129.949, + "int8_ov": 267.141, + "int8_ovms": 256.821 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "Yolo_V8n", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 153.817, - "int8": 204.691, - "ovmsfp32": 
67.421, - "fp32": 74.996 + "Platform": "Intel® Core™ i3-10100", + "Model": "unet-camvid-onnx-0001", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 1.515, + "fp32_ovms": 1.534, + "int8_ov": 2.96, + "int8_ovms": 2.973 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "Yolo_V8n", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 143.19, - "int8": 197.409, - "ovmsfp32": 62.948, - "fp32": 70.913 + "Platform": "Intel® Core™ i3-10100", + "Model": "yolo_v5m", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 7.691, + "fp32_ovms": 7.511, + "int8_ov": 14.919, + "int8_ovms": 13.832 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "Yolo_V8n", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 56.244, - "int8": 67.968, - "ovmsfp32": 34.396, - "fp32": 38.576 + "Platform": "Intel® Core™ i3-10100", + "Model": "yolo_v8n", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 38.482, + "fp32_ovms": 34.513, + "int8_ov": 68.126, + "int8_ovms": 55.698 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } } ] \ No newline at end of file diff --git a/docs/sphinx_setup/_static/benchmarks_files/graph-config.json b/docs/sphinx_setup/_static/benchmarks_files/graph-config.json index df371e23c8e6eb..6fb8d19e1a1adf 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/graph-config.json +++ b/docs/sphinx_setup/_static/benchmarks_files/graph-config.json @@ -1,13 +1,15 @@ { "PrecisionsMap": { - "OVMSINT8": "ovmsint8", - "OVMSFP32": "ovmsfp32", "INT4": "int4", "INT8": "int8", "FP16": "fp16", "FP32": "fp32", - "BF16": "bf16" + "BF16": "bf16", + "FP32_OV": "fp32_ov", + "FP32_OVMS": "fp32_ovms", + "INT8_OV": "int8_ov", + "INT8_OVMS": "int8_ovms" }, "ParametersMap": { "Throughput": "throughput", @@ -27,7 +29,7 @@ "PrecisionData": { "int4": { "data": null, - "color": "#5bd0f0", + "color": "#76D8F6", "label": "INT4" }, "int8": { @@ -50,15 +52,25 @@ "color": "#00536a", "label": "BF16" }, - "ovmsint8": { + "fp32_ov": { + "data": null, + "color": "#76D8F6", + "label": "FP32 OV" + }, + "fp32_ovms": { + "data": null, + "color": "#00C7FD", + "label": "FP32 OVMS" + }, + "int8_ov": { "data": null, "color": "#009fca", - "label": "FPS( OV Ref. 
INT8)" + "label": "INT8 OV" }, - "ovmsfp32": { + "int8_ovms": { "data": null, "color": "#00536a", - "label": "BF16" + "label": "INT8 OVMS" } }, "Filters": [ diff --git a/scripts/setupvars/setupvars.bat b/scripts/setupvars/setupvars.bat index fac3e7f66c4ed4..8a09d974ecb295 100644 --- a/scripts/setupvars/setupvars.bat +++ b/scripts/setupvars/setupvars.bat @@ -67,7 +67,7 @@ set "PATH=%OPENVINO_LIB_PATHS%;%PATH%" :: Check if Python is installed set PYTHON_VERSION_MAJOR=3 -set MIN_REQUIRED_PYTHON_VERSION_MINOR=8 +set MIN_REQUIRED_PYTHON_VERSION_MINOR=9 set MAX_SUPPORTED_PYTHON_VERSION_MINOR=13 python --version 2>NUL diff --git a/scripts/setupvars/setupvars.ps1 b/scripts/setupvars/setupvars.ps1 index 7dacef5df4306b..2f0f960c1a08e3 100644 --- a/scripts/setupvars/setupvars.ps1 +++ b/scripts/setupvars/setupvars.ps1 @@ -63,7 +63,7 @@ Write-Host "[setupvars] OpenVINO environment initialized" # Check if Python is installed $PYTHON_VERSION_MAJOR = 3 -$MIN_REQUIRED_PYTHON_VERSION_MINOR = 8 +$MIN_REQUIRED_PYTHON_VERSION_MINOR = 9 $MAX_SUPPORTED_PYTHON_VERSION_MINOR = 13 try diff --git a/scripts/setupvars/setupvars.sh b/scripts/setupvars/setupvars.sh index 3b4fb9407f9090..422bc4a035dd8b 100755 --- a/scripts/setupvars/setupvars.sh +++ b/scripts/setupvars/setupvars.sh @@ -100,7 +100,7 @@ if command -v lsb_release >/dev/null 2>&1; then fi PYTHON_VERSION_MAJOR="3" -MIN_REQUIRED_PYTHON_VERSION_MINOR="8" +MIN_REQUIRED_PYTHON_VERSION_MINOR="9" MAX_SUPPORTED_PYTHON_VERSION_MINOR="13" check_python_version () { diff --git a/src/bindings/js/docs/CODESTYLE.md b/src/bindings/js/docs/CODESTYLE.md index 0ebfd322767b57..2441663d6cc424 100644 --- a/src/bindings/js/docs/CODESTYLE.md +++ b/src/bindings/js/docs/CODESTYLE.md @@ -1,9 +1,14 @@ # Code Style Guide +Node.js bindings contain two parts: C++ and Typescript/JavaScript. + This article presents the coding standards for JavaScript and TypeScript parts of **openvino-node** package. The following rules will help maintain code quality and consistency throughout the codebase. +For C++ codestyle rules, refer to [this document](https://github.com/openvinotoolkit/openvino/blob/master/docs/dev/coding_style.md). + Make sure your IDE has ESLint plugin installed. Its rules are specified in the [.eslint-global.js file](../.eslintrc-global.js). Keep in mind that your PR will not be approved if it does not meet the following requirements. + ## General Rules ### 1. Semicolons @@ -89,6 +94,7 @@ Make sure your IDE has ESLint plugin installed. Its rules are specified in the [ - Special case for the `catch` keyword: No space after `catch` - **Enforced By**: `keyword-spacing: ['error', { overrides: { catch: { after: false } } }]` + ## Additional Resources For further details on each rule, refer to the [ESLint documentation](https://eslint.org/docs/rules/). 
diff --git a/src/bindings/js/docs/README.md b/src/bindings/js/docs/README.md index bada676878847f..f0c70cf4dd9aed 100644 --- a/src/bindings/js/docs/README.md +++ b/src/bindings/js/docs/README.md @@ -2,10 +2,10 @@ ## Folders -- `./docs` - documentation -- `./node` - openvino-node npm package +- [./docs](../docs/) - documentation +- [./node](../node/) - openvino-node npm package -## openvino-node Package Developer Documentation +## `openvino-node` Package Developer Documentation ### Components @@ -28,7 +28,6 @@ ```bash cmake \ -DCMAKE_BUILD_TYPE=Release \ - -DENABLE_FASTER_BUILD=ON \ -DCPACK_GENERATOR=NPM \ -DENABLE_SYSTEM_TBB=OFF -UTBB* \ -DENABLE_TESTS=OFF \ @@ -75,9 +74,9 @@ [OpenVINO™ Node.js Bindings Examples of Usage](../../../../samples/js/node/README.md) -## Contribution +## Contributing -If you want to contribute to the project, refer to the [code style rules](./CODESTYLE.md) and [contribution guide](../../../../CONTRIBUTING.md) first. +Your contributions are welcome! Make sure to read the [Contribution Guide](https://github.com/openvinotoolkit/openvino/blob/master/src/bindings/js/node/CONTRIBUTING.md) to learn how you can get involved. ## See Also diff --git a/src/bindings/js/docs/code_examples.md b/src/bindings/js/docs/code_examples.md index 13bfa14812d54b..08d92e7307dbfe 100644 --- a/src/bindings/js/docs/code_examples.md +++ b/src/bindings/js/docs/code_examples.md @@ -1,22 +1,24 @@ # How to extend the OpenVINO™ JavaScript API code -## Build the OpenVINO™ JavaScript API +## Build the OpenVINO™ JavaScript API + For detailed build instructions, refer to the [OpenVINO™ JavaScript API documentation](./README.md). + ## Project's naming conventions + When implementing the C++ sources for the JavaScript API, it is essential to adhere to the OpenVINO naming conventions described in the [OpenVINO Coding Style Guide](../../../../docs/dev/coding_style.md). In summary, the naming style employs `Snake Case` for methods, functions, and variables, while `Camel Case` is used for class names. Additionally, the naming of entities in the C++ sources should closely mirror their equivalents in the C++ API to maintain consistency. For methods that are exposed to JavaScript, the naming convention transitions to `Camel Case`, aligning with common JavaScript practices. As an example, a method in the C++ API named `get_element_type` would be represented in the JavaScript API as `getElementType()`. + ## node-addon-api module [node addon api](https://github.com/nodejs/node-addon-api) is used to create OpenVINO JavaScript API for Node.js. The quickest way to learn is to follow the official [examples](https://github.com/nodejs/node-addon-examples). It is recommended to check out the tutorial on [how to create a JavaScript object from a C++ object](https://github.com/nodejs/node-addon-examples/tree/main/src/2-js-to-native-conversion/object-wrap-demo/node-addon-api). - - - ## Adding a new class and method + To introduce a new `MyTensor` class that interacts with the `ov::Tensor` class, follow these steps: - The class should facilitate construction from an ov::Tensor instance and allow initialization from a JavaScript element type and shape. - It should also provide a getElementType method that retrieves the ov::Tensor element type. 
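To make the naming convention above concrete, here is a minimal, hypothetical sketch (not taken from this PR) of how a snake_case C++ method is registered under its camelCase JavaScript name with node-addon-api; the `_tensor` member and the `get_class` helper are illustrative assumptions, not the actual openvino-node sources:

```cpp
#include <napi.h>

#include "openvino/runtime/tensor.hpp"

// Hypothetical wrapper used only to illustrate the naming rule: the C++ method keeps
// its snake_case name, while the string passed to InstanceMethod() is the camelCase
// name that becomes visible from JavaScript.
class MyTensor : public Napi::ObjectWrap<MyTensor> {
public:
    explicit MyTensor(const Napi::CallbackInfo& info) : Napi::ObjectWrap<MyTensor>(info) {}

    static Napi::Function get_class(Napi::Env env) {
        return DefineClass(env,
                           "MyTensor",
                           {
                               InstanceMethod("getElementType", &MyTensor::get_element_type),
                           });
    }

    // Exposed to JavaScript as tensor.getElementType()
    Napi::Value get_element_type(const Napi::CallbackInfo& info) {
        return Napi::String::New(info.Env(), _tensor.get_element_type().to_string());
    }

private:
    ov::Tensor _tensor;  // assumed member that holds the wrapped ov::Tensor
};
```

From JavaScript, the same method would then be called as `tensor.getElementType()`.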
@@ -25,7 +27,7 @@ Begin by creating a header file for the `MyTensor` class in the OpenVINO reposit ```cpp class MyTensor : public Napi::ObjectWrap { public: - // Constructor for the wrapper class + // Constructor for the wrapper class MyTensor(const Napi::CallbackInfo& info); // It returns a JavaScript class definition @@ -75,12 +77,15 @@ add_library(${PROJECT_NAME} SHARED ) ``` + ### Argument validation and conversion When binding JavaScript arguments with C++ functions, it is crucial to validate and convert the arguments appropriately. The template `ov::js::validate` function is a utility that facilitates this process. It is particularly useful for handling different overloads of functions and ensuring standardized error messages when arguments do not match expected signatures. Before implementing a new conversion function, such as `js_to_cpp`, review the existing [helper methods](../../node/include/helper.hpp) to see if one already meets your requirements. + ### New class initialization + When a new class is introduced to the `openvino-node` module, it must be initialized upon module loading. This is done in the [addon.cpp](../../src/addon.cpp) file. The initialization process registers the class with the Node.js environment so that it can be used within JavaScript code. ```cpp Napi::Object init_module(Napi::Env env, Napi::Object exports) { @@ -100,6 +105,7 @@ struct AddonData { ``` ### Document the new functionality + The last step is to add the TypeScript type definitions and describe the new functionality. ```typescript /** @@ -132,9 +138,9 @@ export interface NodeAddon { Now that coding is finished, remember to rebuild the project and test it out. -To learn how to test your code, refer to the guide on [how to test OpenVINO™ JavaScript API.](./test_examples.md) +To learn how to test your code, refer to the guide on [how to test OpenVINO™ JavaScript API.](./test_examples.md) ## See also * [OpenVINO™ README](../../../../README.md) * [OpenVINO™ bindings README](../../README.md) - * [Developer documentation](../../../../docs/dev/index.md) \ No newline at end of file + * [Developer documentation](../../../../docs/dev/index.md) diff --git a/src/bindings/js/docs/test_examples.md b/src/bindings/js/docs/test_examples.md index b8ff0c8ff7c9d0..0e75cb56f3a700 100644 --- a/src/bindings/js/docs/test_examples.md +++ b/src/bindings/js/docs/test_examples.md @@ -1,6 +1,6 @@ # How to test the OpenVINO™ JavaScript API -## Build the OpenVINO™ JavaScript API +## Build the OpenVINO™ JavaScript API For detailed build instructions, refer to the [OpenVINO™ JavaScript API documentation](./README.md). @@ -17,14 +17,14 @@ npm run test To run specific test files, you can pass one or more glob patterns: ```shell -node --test "tests/unit/core.test.js" "tests/unit/*model.test.js" +node --test "tests/unit/core.test.js" "tests/unit/*model.test.js" ``` Before executing individual test files, a one-time setup is required. If you have not previously executed `npm run test`, initiate the setup by running the following command: ```shell npm run test_setup -``` +``` More information on running tests from the command line can be found in the [Node.js documentation]( https://nodejs.org/docs/latest/api/test.html#running-tests-from-the-command-line). @@ -45,11 +45,11 @@ It is recommended to run the code style check each time new tests are added. ## Writing OpenVINO™ JavaScript API tests + ### Before start Follow and complete [Examples of OpenVINO™ JavaScript API code](./code_examples.md). 
- ### Adding new test-case in the correct place Each new test should verify the correct behavior of the new functionality (e.g. class, method). @@ -57,7 +57,8 @@ Unit test files are located in the `/src/bindings/js/node/tests/u Always add tests to the correct locations and create new files only when necessary. *Remember to include the license on top of each new file*. -### Test writing guidelines + +### Test writing guidelines Each test file starts with a `describe` block to group all tests related to a specific class or module. The name of the `describe` block should match the name of the class or module being tested, for example *ov.Core tests*. Within the `describe` block, individual tests are defined using `test` or `it` blocks, with the name of the test reflecting what is being tested. If multiple tests relate to the same method, they can be grouped within a nested `describe` block. diff --git a/src/bindings/js/node/CONTRIBUTING.md b/src/bindings/js/node/CONTRIBUTING.md new file mode 100644 index 00000000000000..aacef418aeed2d --- /dev/null +++ b/src/bindings/js/node/CONTRIBUTING.md @@ -0,0 +1,67 @@ +# Contributing to OpenVINO™ Node.js API + +Your commitment to this project is greatly appreciated and the following guide is intended to help you contribute. + +Make sure to read [main contribution guide](https://github.com/openvinotoolkit/openvino/blob/master/CONTRIBUTING.md) first. It covers most topics related to contributing to OpenVINO. + + +## TLDR + +1. Decide what you want to change. +2. Create your fork of the OpenVINO repository. +3. Create a branch with a meaningful name for your changes. +4. Align the code style, commit the changes, and run tests. +5. Create a Pull Request, which clearly describes what has been changed and why. +6. Go through the Code Review. +7. Get your awesome code merged! + +Read the section below for more details. + + +## How to Decide What to Change + +In case of minor fixes, like changing variable names, additional parameter checks, etc., go to the next step. + +However, if you want to bring significant changes, for example, the extension of architecture or a big part of functionality, that involves a large amount +of source code, open [an issue](https://github.com/openvinotoolkit/openvino/issues/new?assignees=octocat&labels=enhancement%2Cfeature&projects=&template=feature_request.yml&title=%5BFeature+Request%5D%3A+) first and discuss your idea with +codeowners. It will prevent you from doing extra work. + +You can also take one of the well-described tasks from the [Good First Issue](https://github.com/orgs/openvinotoolkit/projects/3/views/14) section. It can be a great start to contributing with codeowners' support! + + +## Let's code + +Get familiar with Node.js API architecture and code samples. +Refer to the [guide](../docs/code_examples.md), which will help you understand the component structure and the code style. + +The environment setup and build instructions can be found in [Building the Node.js API](https://github.com/openvinotoolkit/openvino/blob/master/src/bindings/js/docs/README.md#openvino-node-package-developer-documentation). + +Run tests! If you add a new functionality, make sure that it is covered by tests first. +Read [the guide](../docs/test_examples.md) for more details about the tests and their runs. +Many CI checks will run after getting a Code Review. Make sure that +all checks have passed. CI checks are composed of both functional tests and code-style checks and may fail because of warnings/errors in both stages. 
+ +Remember to follow [our codestyle](../docs/CODESTYLE.md). +By following the provided guide and using automated code style checking tools, like +**eslint** and **clang-format-9**, you will save some time and help with the code review of proposed changes. + + +## Description of the Pull Request + +Add the `[OV JS]` tag to all PR titles. Provide any relevant details in the description, as it will definitely help with the review. The minimum requirement is a compact, bulleted list of proposed changes. + +Use the following template: +``` +*Describe what is the purpose of this PR* + +### Details: +- *Describe your changes.* +- ... + +``` + + +## License + +By contributing to the OpenVINO project, you agree that your contributions will be +licensed under the terms of the [LICENSE](https://github.com/openvinotoolkit/openvino/blob/master/LICENSE). diff --git a/src/bindings/js/node/README.md b/src/bindings/js/node/README.md index e2c38f2a18e516..c927bd0b360ed4 100644 --- a/src/bindings/js/node/README.md +++ b/src/bindings/js/node/README.md @@ -1,8 +1,14 @@ # OpenVINO™ Node.js Bindings -Use OpenVINO JavaScript API for your Node.js application. +Use OpenVINO to deploy deep learning models easily in Node.js applications. -## Usage +## Introduction + +OpenVINO™ is an open-source toolkit designed for high-performance deep learning inference. +The Node.js API provides bindings to a subset of the OpenVINO Runtime API. +The Node.js bindings enable JavaScript developers to use the capabilities of OpenVINO in their applications. + +## Quick Start Install the **openvino-node** package: ```bash @@ -14,15 +20,21 @@ Use the **openvino-node** package: ``` const { addon: ov } = require('openvino-node'); ``` +Refer to the complete description of the `addon` API in the [documentation](https://docs.openvino.ai/2024/api/nodejs_api/addon.html). + +See the [samples](https://github.com/openvinotoolkit/openvino/blob/master/samples/js/node/README.md) for more details on how to use it. + ## Usage in Electron applications To use the package in development of Electron applications on Windows, make sure that **Desktop development with C++** component from [Build Tools for Visual Studio](https://aka.ms/vs/17/release/vs_BuildTools.exe) is installed. -## Build From Sources +## Supported Platforms -For more details, refer to the [OpenVINO™ JavaScript API Developer Documentation](https://github.com/openvinotoolkit/openvino/blob/master/src/bindings/js/docs/README.md#openvino-node-package-developer-documentation) +- Windows x86 +- Linux x86/ARM +- macOS x86/ARM ## Documentation & Samples @@ -31,11 +43,19 @@ For more details, refer to the [OpenVINO™ JavaScript API Developer Documentati ## Live Sample -You can run this sample in the browser; no installation is required. +You can run the following sample in the browser; no installation is required. [Codesandbox](https://codesandbox.io/) is a free online service with limited resources. For optimal performance and more control, it is recommended to run the sample locally. - [hello-classification-sample](https://codesandbox.io/p/devbox/openvino-node-hello-classification-sample-djl893) +## Build From Sources + +For more details, refer to the [OpenVINO™ JavaScript API Developer Documentation](https://github.com/openvinotoolkit/openvino/blob/master/src/bindings/js/docs/README.md#openvino-node-package-developer-documentation) + +## Contributing + +Contributions are always welcome!
Read the [Contribution Guide](https://github.com/openvinotoolkit/openvino/blob/master/src/bindings/js/node/CONTRIBUTING.md) to learn how you can get involved. + ## See Also * [OpenVINO™ README](https://github.com/openvinotoolkit/openvino/blob/master/README.md) diff --git a/src/bindings/js/node/package.json b/src/bindings/js/node/package.json index d00633c93b062a..8bc6bbd4bb1d46 100644 --- a/src/bindings/js/node/package.json +++ b/src/bindings/js/node/package.json @@ -48,5 +48,8 @@ "remote_path": "./repositories/openvino/nodejs_bindings/{version}/{platform}/", "package_name": "openvino_nodejs_bindings_{platform}_{version}_{arch}.tar.gz", "host": "https://storage.openvinotoolkit.org" - } + }, + "keywords": [ + "OpenVINO" + ] } diff --git a/src/bindings/python/src/openvino/preprocess/torchvision/requirements.txt b/src/bindings/python/src/openvino/preprocess/torchvision/requirements.txt index 23ba17d4918e71..201d5085bd1583 100644 --- a/src/bindings/python/src/openvino/preprocess/torchvision/requirements.txt +++ b/src/bindings/python/src/openvino/preprocess/torchvision/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cpu torch>=1.13 -torchvision; platform_machine == 'arm64' and python_version >= '3.8' +torchvision; platform_machine == 'arm64' and python_version >= '3.9' torchvision; platform_machine != 'arm64' pillow>=9.0 \ No newline at end of file diff --git a/src/common/transformations/include/ov_ops/rotary_positional_embeddings.hpp b/src/common/transformations/include/ov_ops/rotary_positional_embeddings.hpp index 5d839c19600340..dcb9aef187d2d9 100644 --- a/src/common/transformations/include/ov_ops/rotary_positional_embeddings.hpp +++ b/src/common/transformations/include/ov_ops/rotary_positional_embeddings.hpp @@ -27,6 +27,8 @@ class TRANSFORMATIONS_API RoPE : public Op { bool is_interleaved = false; // interleaved mode, implies trans0213 happens after RoPE size_t rotary_ndims = 0; // dimensions to be embedded (d in the description) bool is_chatglm = false; // chatglm is special which overrides other setting + bool support_2d_rope = false; // 2d rope mode: supports 2-dimensional RoPE which is independent of batch and + // each head;
change input order to [batch, head_cnt, 4608] to support 2d rope bool is_qwen = false; // Qwen is special which overrides other setting size_t head_cnt = 0; size_t head_size = 0; diff --git a/src/common/transformations/include/transformations/common_optimizations/fuse_rotary_positional_embeddings.hpp b/src/common/transformations/include/transformations/common_optimizations/fuse_rotary_positional_embeddings.hpp index 5cd99f88d13413..eb1c92bcf9607f 100644 --- a/src/common/transformations/include/transformations/common_optimizations/fuse_rotary_positional_embeddings.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/fuse_rotary_positional_embeddings.hpp @@ -38,7 +38,7 @@ class ov::pass::RoPEFusionGPTJ : public ov::pass::MatcherPass { class ov::pass::RoPEFusionChatGLM : public ov::pass::MatcherPass { public: OPENVINO_RTTI("RoPEFusionChatGLM", "0"); - RoPEFusionChatGLM(int split_output_id); + RoPEFusionChatGLM(int split_output_id, const bool support_2d_rope = false); }; class ov::pass::RoPEFusionQwen : public ov::pass::MatcherPass { @@ -84,7 +84,7 @@ class ov::pass::RoPEShareCosSin : public ov::pass::MatcherPass { class ov::pass::RoPEFusion : public ov::pass::GraphRewrite { public: OPENVINO_RTTI("RoPEFusion", "0"); - RoPEFusion() { + RoPEFusion(bool support_2d_rope = false) { add_matcher(); add_matcher(); // optional heads & tails are fused in separate matcher pass, @@ -95,6 +95,10 @@ class ov::pass::RoPEFusion : public ov::pass::GraphRewrite { add_matcher(0); add_matcher(1); + if (support_2d_rope) { + add_matcher(0, true); + add_matcher(1, true); + } add_matcher(0); add_matcher(1); diff --git a/src/common/transformations/src/ov_ops/rotary_positional_embeddings.cpp b/src/common/transformations/src/ov_ops/rotary_positional_embeddings.cpp index 915adecda0af68..3e75e2b88df266 100644 --- a/src/common/transformations/src/ov_ops/rotary_positional_embeddings.cpp +++ b/src/common/transformations/src/ov_ops/rotary_positional_embeddings.cpp @@ -45,13 +45,27 @@ void RoPE::validate_and_infer_types() { } if (m_config.is_chatglm) { - // chatGLM specific RoPE - // input [length, batch_size, (hidden_states_q + hidden_states_k + hidden_states_v)] - // output [length, batch_size, head_cnt, hidden_states_k] - set_output_type( - 0, - get_input_element_type(0), - {input_pshape[0], input_pshape[1], ov::Dimension(m_config.head_cnt), ov::Dimension(m_config.head_size)}); + if (m_config.support_2d_rope) { + // chatGLM specific RoPE + // input [batch_size, length, (hidden_states_q + hidden_states_k + hidden_states_v)] + // output [batch_size, head_cnt, length, hidden_states_k] + set_output_type(0, + get_input_element_type(0), + {input_pshape[0], + ov::Dimension(m_config.head_cnt), + input_pshape[1], + ov::Dimension(m_config.head_size)}); + } else { + // chatGLM specific RoPE + // input [length, batch_size, (hidden_states_q + hidden_states_k + hidden_states_v)] + // output [length, batch_size, head_cnt, hidden_states_k] + set_output_type(0, + get_input_element_type(0), + {input_pshape[0], + input_pshape[1], + ov::Dimension(m_config.head_cnt), + ov::Dimension(m_config.head_size)}); + } return; } @@ -79,6 +93,7 @@ bool RoPE::visit_attributes(ov::AttributeVisitor& visitor) { visitor.on_attribute("is_interleaved", m_config.is_interleaved); visitor.on_attribute("rotary_ndims", m_config.rotary_ndims); visitor.on_attribute("is_chatglm", m_config.is_chatglm); + visitor.on_attribute("support_2d_rope", m_config.support_2d_rope); visitor.on_attribute("is_qwen", m_config.is_qwen); 
visitor.on_attribute("head_cnt", m_config.head_cnt); visitor.on_attribute("head_size", m_config.head_size); diff --git a/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp b/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp index b6c19a0a0391fd..143603f0415373 100644 --- a/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp @@ -417,12 +417,16 @@ ov::pass::RoPEFusionGPTJ::RoPEFusionGPTJ() { this->register_matcher(m, callback); } -ov::pass::RoPEFusionChatGLM::RoPEFusionChatGLM(int split_output_id) { +ov::pass::RoPEFusionChatGLM::RoPEFusionChatGLM(int split_output_id, const bool support_2d_rope) { MATCHER_SCOPE(RoPEFusionChatGLM); - auto qkv_linear = makePattern("[?,?,?]"); // [seq_length, batch_size, 4608] + // [seq_length, batch_size, input_size(will be cropped to match hidden state size)] + // [batch_size, seq_length, input_size] support_2d_rope + auto qkv_linear = makePattern("[?,?,?]"); auto seq_length = makePattern("i32[1]"); - auto cos_sin_cache = makePattern("[?,?,?,?]"); // [max_pos_embeddings, batch_size, 32, 2] + // [max_pos_embeddings, batch_size, half_rotary_dims, 2] + // [batch_size, max_pos_embeddings, half_rotary_dims, 2] support_2d_rope + auto cos_sin_cache = makePattern("[?,?,?,?]"); auto ndims = ov::gen_pattern::Symbol("ndims"); auto head_cnt = ov::gen_pattern::Symbol("head_cnt"); @@ -436,37 +440,76 @@ ov::pass::RoPEFusionChatGLM::RoPEFusionChatGLM(int split_output_id) { auto qkv_proj = makePattern({qkv_linear, -1, {total_size_q, total_size_k, total_size_v}}); qkv_proj->set_output_size(3); - // get key [L, B, Hkv, S] auto cur_key = makePattern({qkv_proj->output(split_output_id), {0, 0, head_cnt, head_size}}, {{"special_zero", true}}); - auto slice_Slice_437 = GenSlice(cur_key, 0, ndims, 1, 3); - auto var_split_1 = makePattern({cur_key, 3, {ndims, ov::gen_pattern::Symbol("end")}}); + std::shared_ptr input_key = nullptr; + // Extended the RoPE to a two-dimensional form to accommodate the 2D positional encoding in GLM. 
+ // Calculate positional embedding independent of batch and each head + if (support_2d_rope) { + // Get transposed key [batch, head_cnt, seq_length, head_size] + input_key = makePattern({cur_key, {0, 2, 1, 3}}); + } else { + // Get key [seq_length, batch, head_cnt, head_size] + input_key = std::move(cur_key); + } + + auto slice_Slice_437 = GenSlice(input_key, 0, ndims, 1, 3); + auto var_split_1 = makePattern({input_key, 3, {ndims, ov::gen_pattern::Symbol("end")}}); var_split_1->set_output_size(2); // rotate half - auto ListConstruct_452_Concat = - makePattern({seq_length, {-1}, {head_cnt}, {ndims / 2}, {2}}, {{"axis", 0}}); - auto const_target_shape_1 = makeConst({seq_len, batch, head_cnt, ndims / 2, 2}); - - auto ListConstruct_379_Concat = - makePattern({seq_length, {-1}, {1}, {ndims / 2}, {2}}, {{"axis", 0}}); - auto const_target_shape_2 = makeConst({seq_len, batch, 1, ndims / 2, 2}); - - auto reshape_Reshape_453 = makePattern( - {slice_Slice_437 | var_split_1->output(0), ListConstruct_452_Concat | const_target_shape_1}); + std::shared_ptr reshape_Reshape_453 = nullptr; + if (support_2d_rope) { + auto const_target_shape_1 = makeConst({0, head_cnt, 0, ndims / 2, 2}); + reshape_Reshape_453 = + makePattern({slice_Slice_437 | var_split_1->output(0), const_target_shape_1}, + {{"special_zero", true}}); + } else { + auto ListConstruct_452_Concat = + makePattern({seq_length, {-1}, {head_cnt}, {ndims / 2}, {2}}, {{"axis", 0}}); + auto const_target_shape_1 = makeConst({seq_len, batch, head_cnt, ndims / 2, 2}); + reshape_Reshape_453 = makePattern( + {slice_Slice_437 | var_split_1->output(0), ListConstruct_452_Concat | const_target_shape_1}); + } auto x_even = makePattern({reshape_Reshape_453, 0, -1}, {{"batch_dims", 0}}); auto x_odd = makePattern({reshape_Reshape_453, 1, -1}, {{"batch_dims", 0}}); - auto slice_Slice_449 = makePattern({cos_sin_cache, {0}, seq_length, {1}, {0}}); - auto slice_StridedSlice_449 = GenStridedSlice(cos_sin_cache, {0}, seq_length, {1}, 0); + auto var_split_2 = makePattern({cos_sin_cache, 0, {0, ov::gen_pattern::Symbol("end")}}); var_split_2->set_output_size(2); - auto view_Reshape_460 = - makePattern({slice_StridedSlice_449 | slice_Slice_449 | var_split_2->output(0), - ListConstruct_379_Concat | const_target_shape_2}, - {{"special_zero", false}}); + std::shared_ptr view_Reshape_460 = nullptr; + if (support_2d_rope) { + auto ListConstruct_379_Concat = + makePattern({{-1}, {1}, seq_length, {ndims / 2}, {2}}, {{"axis", 0}}); + auto const_target_shape_2 = makeConst({batch, 1, seq_len, ndims / 2, 2}); + + // Slice cos_sin_cache to support 2-dimentional RoPE + auto ScatterUpdate = makePattern({{0, 0}, {1}, seq_length, {0}}, {}); + auto slice_Slice_449_1d = makePattern({cos_sin_cache, {0}, seq_length, {1}, {1}}); + auto slice_Slice_449_2d = makePattern({cos_sin_cache, {0, 0}, ScatterUpdate, {1, 1}, {0}}); + auto slice_StridedSlice_449 = GenStridedSlice(cos_sin_cache, {0, 0}, ScatterUpdate, {1, 1}, 1); + + // [batch, 1, seq_length, half_rotary_dims, 2] + view_Reshape_460 = makePattern( + {slice_StridedSlice_449 | slice_Slice_449_1d | slice_Slice_449_2d | var_split_2->output(0), + ListConstruct_379_Concat | const_target_shape_2}, + {{"special_zero", false}}); + } else { + auto ListConstruct_379_Concat = + makePattern({seq_length, {-1}, {1}, {ndims / 2}, {2}}, {{"axis", 0}}); + auto const_target_shape_2 = makeConst({seq_len, batch, 1, ndims / 2, 2}); + + auto slice_Slice_449 = makePattern({cos_sin_cache, {0}, seq_length, {1}, {0}}); + auto slice_StridedSlice_449 = 
GenStridedSlice(cos_sin_cache, {0}, seq_length, {1}, 0); + + // [seq_length, 1, batch, half_rotary_dims, 2] + view_Reshape_460 = + makePattern({slice_StridedSlice_449 | slice_Slice_449 | var_split_2->output(0), + ListConstruct_379_Concat | const_target_shape_2}, + {{"special_zero", false}}); + } auto cos_tab = makePattern({view_Reshape_460, 0, -1}, {{"batch_dims", 0}}); auto x_even_cos = makePattern({x_even, cos_tab}, {{"auto_broadcast", "numpy"}}); @@ -487,11 +530,21 @@ ov::pass::RoPEFusionChatGLM::RoPEFusionChatGLM(int split_output_id) { auto ShapeOf_135133 = makePattern({stack_481}); auto flatten_Slice_497 = GenSlice(ShapeOf_135133, 0, 3, 1, 0); auto flatten_Concat_500 = makePattern({flatten_Slice_497, {-1}}, {{"axis", 0}}); - auto const_target_shape_3 = makeConst({seq_len, batch, head_cnt, ndims}); - // [length, batch, head_cnt, half_rotary_dims, 2] - auto flatten_Reshape_501 = - makePattern({stack_481, flatten_Concat_500 | const_target_shape_3}, {{"special_zero", true}}); - auto slice_Slice_443 = GenSlice(cur_key, ndims, INT_MAX, 1, 3); + + std::shared_ptr const_target_shape_3 = nullptr; + std::shared_ptr flatten_Reshape_501 = nullptr; + if (support_2d_rope) { + // [batch, head_cnt, length, half_rotary_dims, 2] + const_target_shape_3 = makeConst({batch, head_cnt, seq_len, ndims}); + flatten_Reshape_501 = makePattern({stack_481, flatten_Concat_500 | const_target_shape_3}, + {{"special_zero", true}}); + } else { + // [length, batch, head_cnt, half_rotary_dims, 2] + const_target_shape_3 = makeConst({seq_len, batch, head_cnt, ndims}); + flatten_Reshape_501 = makePattern({stack_481, flatten_Concat_500 | const_target_shape_3}, + {{"special_zero", true}}); + } + auto slice_Slice_443 = GenSlice(input_key, ndims, INT_MAX, 1, 3); auto cat_Concat_505 = makePattern({flatten_Reshape_501, slice_Slice_443 | var_split_1->output(1)}, {{"axis", -1}}); @@ -510,6 +563,7 @@ ov::pass::RoPEFusionChatGLM::RoPEFusionChatGLM(int split_output_id) { OutputVector new_args; config.rotary_ndims = static_cast(validator["ndims"]); config.is_chatglm = true; + config.support_2d_rope = support_2d_rope; config.head_cnt = static_cast(validator["head_cnt"]); config.head_size = static_cast(validator["head_size"]); diff --git a/src/common/transformations/tests/common_optimizations/fuse_rotary_positional_embeddings.cpp b/src/common/transformations/tests/common_optimizations/fuse_rotary_positional_embeddings.cpp index 5b54b4a7cce437..6eb0add525c815 100644 --- a/src/common/transformations/tests/common_optimizations/fuse_rotary_positional_embeddings.cpp +++ b/src/common/transformations/tests/common_optimizations/fuse_rotary_positional_embeddings.cpp @@ -135,6 +135,7 @@ TEST_F(TransformationTestsF, ConvertToROPE_LLama2_no_gather) { {"config.input_trans0213", true}, {"config.is_interleaved", false}, {"config.is_chatglm", false}, + {"config.support_2d_rope", false}, {"config.is_qwen", false}, {"config.head_cnt", 0}, {"config.head_size", 0}, @@ -170,6 +171,7 @@ TEST_F(TransformationTestsF, ConvertToROPE_LLama2_with_gather) { {"config.input_trans0213", true}, {"config.is_interleaved", false}, {"config.is_chatglm", false}, + {"config.support_2d_rope", false}, {"config.is_qwen", false}, {"config.head_cnt", 0}, {"config.head_size", 0}, @@ -308,6 +310,7 @@ TEST_F(TransformationTestsF, ConvertToROPE_GPTNEOX_no_gather) { {"config.input_trans0213", true}, {"config.is_interleaved", false}, {"config.is_chatglm", false}, + {"config.support_2d_rope", false}, {"config.is_qwen", false}, {"config.head_cnt", 0}, {"config.head_size", 0}, @@ -342,6 
+345,7 @@ TEST_F(TransformationTestsF, ConvertToROPE_GPTNEOX_with_gather) { {"config.input_trans0213", true}, {"config.is_interleaved", false}, {"config.is_chatglm", false}, + {"config.support_2d_rope", false}, {"config.is_qwen", false}, {"config.head_cnt", 0}, {"config.head_size", 0}, @@ -457,6 +461,7 @@ TEST_F(TransformationTestsF, ConvertToROPE_GPTJ) { {"config.input_trans0213", false}, {"config.is_interleaved", true}, {"config.is_chatglm", false}, + {"config.support_2d_rope", false}, {"config.is_qwen", false}, {"config.head_cnt", 0}, {"config.head_size", 0}, @@ -566,6 +571,7 @@ TEST_F(TransformationTestsF, ConvertToROPE_chatGML) { {"config.is_interleaved", false}, {"config.rotary_ndims", rotary_ndims}, {"config.is_chatglm", true}, + {"config.support_2d_rope", false}, {"config.is_qwen", false}, {"config.head_cnt", num_heads}, {"config.head_size", ndims}, @@ -643,6 +649,7 @@ TEST_F(TransformationTestsF, ConvertToROPE_chatGML_Slice) { {"config.is_interleaved", false}, {"config.rotary_ndims", rotary_ndims}, {"config.is_chatglm", true}, + {"config.support_2d_rope", false}, {"config.is_qwen", false}, {"config.head_cnt", num_heads}, {"config.head_size", ndims}, @@ -723,6 +730,7 @@ TEST_F(TransformationTestsF, ConvertToROPE_GPTJ_Slice) { {"config.input_trans0213", false}, {"config.is_interleaved", true}, {"config.is_chatglm", false}, + {"config.support_2d_rope", false}, {"config.is_qwen", false}, {"config.head_cnt", 0}, {"config.head_size", 0}, @@ -730,4 +738,120 @@ TEST_F(TransformationTestsF, ConvertToROPE_GPTJ_Slice) { {"config.gather_position_arg_id", 0}}); model_ref = std::make_shared(ov::NodeVector{rope}, ov::ParameterVector{input, cos_sin}); } +} + +TEST_F(TransformationTestsF, ConvertToROPE_chatGML_2d_rope) { + disable_rt_info_check(); + const int batch = 2; + const int seq_len = 7; + const int num_heads = 32; + const int ndims = 128; + const int rotary_ndims = 64; + const int max_pos_length = 2048; + { + auto input = std::make_shared(ov::element::f32, ov::PartialShape{batch, seq_len, 4608}); + auto cos_sin_cache = + std::make_shared(ov::element::f32, + ov::PartialShape{max_pos_length, (rotary_ndims / 2), 2}); + auto position_ids = std::make_shared(ov::element::i32, ov::PartialShape{batch, seq_len}); + + auto __module_transformer_index_67_Gather = + makeOP({cos_sin_cache, position_ids, 0}, {{"batch_dims", 0}}); + + auto ListUnpack_321 = makeOP({input, -1, {4096, 256, 256}}); + auto view_Reshape = makeOP({ListUnpack_321->output(0), {0, 0, num_heads, ndims}}, + {{"special_zero", true}}); + + auto permute_Transpose = makeOP({view_Reshape, {0, 2, 1, 3}}, {}); + + auto slice_Slice_357 = + makeOP({permute_Transpose, {0, 0, 0, 0}, {0, 0, 0, rotary_ndims}, {1, 1, 1, 1}}, + {{"begin_mask", {1, 1, 1, 0}}, + {"end_mask", {1, 1, 1, 0}}, + {"new_axis_mask", {}}, + {"shrink_axis_mask", {}}, + {"ellipsis_mask", {}}}); + + auto aten_view_Reshape_1 = + makeOP({ListUnpack_321->output(1), {0, 0, 2, ndims}}, {{"special_zero", true}}); + auto aten_transpose_1 = makeOP({aten_view_Reshape_1, {0, 2, 1, 3}}); + auto shape_of_105249 = makeOP({aten_transpose_1}, {{"output_type", "i32"}}); + auto gather_105252 = makeOP({shape_of_105249, {2}, {0}}, {{"batch_dims", 0}}); + auto scatter_update_63441 = makeOP({{0, 0}, {1}, gather_105252, {0}}); + // connected to cos_sin_cache + auto slice_Slice_369 = makeOP( + {__module_transformer_index_67_Gather, {0, 0}, scatter_update_63441, {1, 1}}, + {{"begin_mask", {1, 0}}, + {"end_mask", {1, 0}}, + {"new_axis_mask", {}}, + {"shrink_axis_mask", {}}, + {"ellipsis_mask", {}}}); + 
auto list_construct_concat_1 = + makeOP({{-1}, {1}, gather_105252, {rotary_ndims / 2}, {2}}, {{"axis", 0}}); + + auto reshape_Reshape_373 = + makeOP({slice_Slice_357, {0, 32, 0, 32, 2}}, {{"special_zero", true}}); + auto select_Gather_384 = + makeOP({reshape_Reshape_373, 0, -1}, {{"batch_dims", 0}}); // x_even + auto select_Gather_381 = + makeOP({reshape_Reshape_373, 1, -1}, {{"batch_dims", 0}}); // x_odd + auto view_Reshape_380 = + makeOP({slice_Slice_369, list_construct_concat_1}, {{"special_zero", false}}); + auto select_Gather_385 = makeOP({view_Reshape_380, 0, -1}, {{"batch_dims", 0}}); // cos_tab + auto select_Gather_382 = makeOP({view_Reshape_380, 1, -1}, {{"batch_dims", 0}}); // sin_tab + + auto mul_Multiply_386 = makeOP({select_Gather_381, select_Gather_382}, + {{"auto_broadcast", "numpy"}}); // x_odd_sin + auto mul_Multiply_383 = makeOP({select_Gather_384, select_Gather_385}, + {{"auto_broadcast", "numpy"}}); // x_even_cos + auto Multiply_101315 = + makeOP({mul_Multiply_386, -1.000000f}, {{"auto_broadcast", "numpy"}}); + auto sub_Subtract_389 = + makeOP({mul_Multiply_383, Multiply_101315}, {{"auto_broadcast", "numpy"}}); + + auto mul_Multiply_391 = makeOP({select_Gather_381, select_Gather_385}, + {{"auto_broadcast", "numpy"}}); // x_odd_cos + auto mul_Multiply_393 = makeOP({select_Gather_384, select_Gather_382}, + {{"auto_broadcast", "numpy"}}); // x_even_sin + auto add_Add_396 = makeOP({mul_Multiply_391, mul_Multiply_393}, {{"auto_broadcast", "numpy"}}); + + auto Unsqueeze_62716 = makeOP({sub_Subtract_389, -1}, {}); + auto Unsqueeze_62717 = makeOP({add_Add_396, -1}, {}); + + auto stack_401 = makeOP({Unsqueeze_62716, Unsqueeze_62717}, {{"axis", -1}}); + auto flatten_Reshape_421 = + makeOP({stack_401, {0, num_heads, 0, rotary_ndims}}, {{"special_zero", true}}); + auto slice_Slice_363 = makeOP( + {permute_Transpose, {0, 0, 0, rotary_ndims}, {0, 0, 0, INT_MAX}, {1, 1, 1, 1}}, + {{"begin_mask", {1, 1, 1, 0}}, + {"end_mask", {1, 1, 1, 0}}, + {"new_axis_mask", {}}, + {"shrink_axis_mask", {}}, + {"ellipsis_mask", {}}}); + auto cat_Concat_425 = makeOP({flatten_Reshape_421, slice_Slice_363}, {{"axis", -1}}); + model = std::make_shared(ov::NodeVector{cat_Concat_425}, + ov::ParameterVector{input, cos_sin_cache, position_ids}); + } + manager.register_pass(true); + { + auto input = std::make_shared(ov::element::f32, ov::Shape{batch, seq_len, 4608}); + auto cos_sin_cache = + std::make_shared(ov::element::f32, ov::Shape{max_pos_length, (rotary_ndims / 2), 2}); + auto position_ids = std::make_shared(ov::element::i32, ov::PartialShape{batch, seq_len}); + auto gather_cos_sin = makeOP({cos_sin_cache, position_ids, 0}, {{"batch_dims", 0}}); + auto rope = makeOP({input, gather_cos_sin, gather_cos_sin}, + {{"config.slice_start", 0}, + {"config.slice_stop", 4096}, + {"config.input_trans0213", false}, + {"config.is_interleaved", false}, + {"config.rotary_ndims", rotary_ndims}, + {"config.is_chatglm", true}, + {"config.support_2d_rope", true}, + {"config.is_qwen", false}, + {"config.head_cnt", num_heads}, + {"config.head_size", ndims}, + {"config.gather_position_arg_id", 0}}); + model_ref = + std::make_shared(ov::NodeVector{rope}, ov::ParameterVector{input, cos_sin_cache, position_ids}); + } } \ No newline at end of file diff --git a/src/common/util/include/openvino/util/mmap_object.hpp b/src/common/util/include/openvino/util/mmap_object.hpp index 364e1eed4ca712..3aba8e69c094a1 100644 --- a/src/common/util/include/openvino/util/mmap_object.hpp +++ b/src/common/util/include/openvino/util/mmap_object.hpp 
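To recap the shape semantics introduced by `support_2d_rope`: with `is_chatglm` enabled, the op now accepts a `[batch, seq_length, hidden]` input and produces a `[batch, head_cnt, seq_length, head_size]` output instead of the seq-first layout. The following is a minimal sketch, not part of the diff; it assumes the op is exposed as `ov::op::internal::RoPE` (as in the OpenVINO source tree) and reuses the dimensions of the new unit test:

```cpp
#include <memory>

#include "openvino/op/parameter.hpp"
#include "ov_ops/rotary_positional_embeddings.hpp"

// Sketch: build a 2D-RoPE node with the dimensions used by the new unit test
// (batch=2, seq_len=7, 32 heads of size 128, rotary_ndims=64) and let
// validate_and_infer_types() propagate [batch, head_cnt, seq_len, head_size].
std::shared_ptr<ov::op::internal::RoPE> make_chatglm_2d_rope() {
    auto qkv = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{2, 7, 4608});
    auto cos_sin = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{2, 7, 32, 2});

    ov::op::internal::RoPE::Config config;
    config.is_chatglm = true;
    config.support_2d_rope = true;  // the new flag introduced by this change
    config.rotary_ndims = 64;
    config.head_cnt = 32;
    config.head_size = 128;
    config.slice_start = 0;
    config.slice_stop = 4096;  // keep only the query part of the fused QKV projection

    auto rope = std::make_shared<ov::op::internal::RoPE>(ov::OutputVector{qkv, cos_sin, cos_sin}, config);
    // rope->get_output_partial_shape(0) is expected to be {2, 32, 7, 128}.
    return rope;
}
```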
@@ -50,18 +50,4 @@ std::shared_ptr load_mmap_object(const std::string& path); std::shared_ptr load_mmap_object(const std::wstring& path); #endif // OPENVINO_ENABLE_UNICODE_PATH_SUPPORT - -class MmapStream final : public std::ifstream { -public: - MmapStream(const std::string& path) : std::ifstream(path, std::ios_base::binary) { - m_memory = ov::load_mmap_object(path); - } - -#ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT - MmapStream(const std::wstring& path); -#endif // OPENVINO_ENABLE_UNICODE_PATH_SUPPORT - - std::shared_ptr m_memory; -}; - } // namespace ov diff --git a/src/common/util/src/os/win/win_mmap_object.cpp b/src/common/util/src/os/win/win_mmap_object.cpp index 0b14d7ac774700..6f2515124273f1 100644 --- a/src/common/util/src/os/win/win_mmap_object.cpp +++ b/src/common/util/src/os/win/win_mmap_object.cpp @@ -141,11 +141,6 @@ std::shared_ptr load_mmap_object(const std::wstring& path) { holder->set(path); return holder; } - -MmapStream::MmapStream(const std::wstring& path) : std::ifstream(path.data(), std::ios_base::binary) { - m_memory = ov::load_mmap_object(path); -} - #endif } // namespace ov diff --git a/src/core/dev_api/openvino/op/paged_attention.hpp b/src/core/dev_api/openvino/op/paged_attention.hpp index e5995e0b8699b0..0c1c396cbefb5b 100644 --- a/src/core/dev_api/openvino/op/paged_attention.hpp +++ b/src/core/dev_api/openvino/op/paged_attention.hpp @@ -17,6 +17,11 @@ class OPENVINO_API PagedAttentionExtension : public ov::op::Op { PagedAttentionExtension(const ov::OutputVector& args); void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + void set_out_type(int index, const ov::element::Type& output_type); + +protected: + std::vector m_output_type = {ov::element::undefined, ov::element::undefined}; }; } // namespace op diff --git a/src/core/dev_api/openvino/runtime/shared_buffer.hpp b/src/core/dev_api/openvino/runtime/shared_buffer.hpp index 7f1e2e9ba7601f..2c784ef6081c35 100644 --- a/src/core/dev_api/openvino/runtime/shared_buffer.hpp +++ b/src/core/dev_api/openvino/runtime/shared_buffer.hpp @@ -8,7 +8,7 @@ namespace ov { -/// \brief SharedBuffer class to store pointer to pre-allocated buffer. +/// \brief SharedBuffer class to store pointer to pre-allocated buffer. Own the shared object. template class SharedBuffer : public ov::AlignedBuffer { public: @@ -28,4 +28,60 @@ class SharedBuffer : public ov::AlignedBuffer { T _shared_object; }; +/// \brief SharedStreamBuffer class to store pointer to pre-acclocated buffer and provide streambuf interface. +/// Can return ptr to shared memory and its size +class SharedStreamBuffer : public std::streambuf { +public: + SharedStreamBuffer(char* data, size_t size) : m_data(data), m_size(size), m_offset(0) {} + +protected: + // override std::streambuf methods + std::streamsize xsgetn(char* s, std::streamsize count) override { + auto real_count = std::min(m_size - m_offset, count); + std::memcpy(s, m_data + m_offset, real_count); + m_offset += real_count; + return real_count; + } + + int_type underflow() override { + return (m_size == m_offset) ? traits_type::eof() : traits_type::to_int_type(*(m_data + m_offset)); + } + + int_type uflow() override { + return (m_size == m_offset) ? 
traits_type::eof() : traits_type::to_int_type(*(m_data + m_offset++)); + } + + std::streamsize showmanyc() override { + return m_size - m_offset; + } + + pos_type seekoff(off_type off, + std::ios_base::seekdir dir, + std::ios_base::openmode which = std::ios_base::in) override { + if (dir != std::ios_base::cur || which != std::ios_base::in) { + return pos_type(off_type(-1)); + } + m_offset += off; + return pos_type(m_offset); + } + + char* m_data; + size_t m_size; + size_t m_offset; +}; + +/// \brief OwningSharedStreamBuffer is a SharedStreamBuffer which owns its shared object. +class OwningSharedStreamBuffer : public SharedStreamBuffer { +public: + OwningSharedStreamBuffer(std::shared_ptr buffer) + : SharedStreamBuffer(static_cast(buffer->get_ptr()), buffer->size()), + m_shared_obj(buffer) {} + + std::shared_ptr get_buffer() { + return m_shared_obj; + } + +protected: + std::shared_ptr m_shared_obj; +}; } // namespace ov diff --git a/src/core/include/openvino/op/search_sorted.hpp b/src/core/include/openvino/op/search_sorted.hpp index 78650942ee8f0f..c370ba46b2f182 100644 --- a/src/core/include/openvino/op/search_sorted.hpp +++ b/src/core/include/openvino/op/search_sorted.hpp @@ -36,8 +36,6 @@ class OPENVINO_API SearchSorted : public Op { m_right_mode = right_mode; } - bool validate() const; - private: bool m_right_mode{}; }; diff --git a/src/core/reference/include/openvino/reference/proposal.hpp b/src/core/reference/include/openvino/reference/proposal.hpp index f80faafc5efd27..a2d727b6156aea 100644 --- a/src/core/reference/include/openvino/reference/proposal.hpp +++ b/src/core/reference/include/openvino/reference/proposal.hpp @@ -319,6 +319,11 @@ static void proposal_exec(const T* class_probs, const Shape& output_shape, const Shape& out_probs_shape, const op::v0::Proposal::Attributes& attrs) { + const auto batch_num = static_cast(class_probs_shape[0]); + const auto coordinates_offset = attrs.framework == "tensorflow" ? 0.f : 1.f; + const auto initial_clip = attrs.framework == "tensorflow"; + const auto swap_xy = attrs.framework == "tensorflow"; + const T* p_bottom_item = class_probs; const T* p_d_anchor_item = bbox_deltas; T* p_roi_item = output; @@ -328,8 +333,8 @@ static void proposal_exec(const T* class_probs, const unsigned int bottom_H = static_cast(class_probs_shape[2]); const unsigned int bottom_W = static_cast(class_probs_shape[3]); // input image height and width - const T img_H = image_shape[0]; - const T img_W = image_shape[1]; + const T img_H = image_shape[swap_xy ? 1 : 0]; + const T img_W = image_shape[swap_xy ? 0 : 1]; // scale factor for H and W, depends on shape of image_shape // can be split into H and W {image_height, image_width, scale_height, // scale_width} @@ -350,11 +355,6 @@ static void proposal_exec(const T* class_probs, std::vector anchors = generate_anchors(attrs, anchor_count); - unsigned int batch_num = static_cast(class_probs_shape[0]); - float coordinates_offset = attrs.framework == "tensorflow" ? 
0.0f : 1.0f; - bool initial_clip = attrs.framework == "tensorflow"; - bool swap_xy = attrs.framework == "tensorflow"; - for (unsigned int batch_idx = 0; batch_idx < batch_num; ++batch_idx) { std::fill(roi_indices.begin(), roi_indices.end(), 0); num_rois = 0; diff --git a/src/core/shape_inference/include/search_sorted_shape_inference.hpp b/src/core/shape_inference/include/search_sorted_shape_inference.hpp index 7ea0598cffbc87..4b9d888891e835 100644 --- a/src/core/shape_inference/include/search_sorted_shape_inference.hpp +++ b/src/core/shape_inference/include/search_sorted_shape_inference.hpp @@ -12,28 +12,40 @@ namespace op { namespace v15 { template > std::vector shape_infer(const SearchSorted* op, const std::vector& input_shapes) { - // [HACK]: By convention, shape_infer should also perform node validation.. - op->validate(); const auto& sorted_shape = input_shapes[0]; const auto& values_shape = input_shapes[1]; + const auto is_sorted_rank_static = sorted_shape.rank().is_static(); + const auto is_values_rank_static = values_shape.rank().is_static(); - auto output_shape = values_shape; - - // 1. If we know that the sorted sequence is 1D, than output shape can be anything. - if (sorted_shape.rank().is_static() && sorted_shape.rank().get_length() == 1) { - return {std::move(output_shape)}; + if (!is_sorted_rank_static || sorted_shape.size() == 1) { + // If the sorted sequence is 1D, then any shape of the values input is allowed. + // The shape of the output is the same as the shape of the values. + return {values_shape}; } - // 2. ND tensor case or rank not known. - auto sorted_shape_last_dynamic = sorted_shape; - if (sorted_shape.rank().is_static()) { - sorted_shape_last_dynamic[sorted_shape.rank().get_length() - 1] = Dimension::dynamic(); + const auto sorted_in_rank = sorted_shape.size(); + NODE_SHAPE_INFER_CHECK(op, input_shapes, sorted_in_rank > 0, "The sorted sequence input cannot be a scalar."); + + TRShape output_shape; + if (!is_values_rank_static) { + output_shape = sorted_shape; + output_shape[sorted_in_rank - 1] = Dimension::dynamic(); + } else { + output_shape = values_shape; + NODE_SHAPE_INFER_CHECK( + op, + input_shapes, + sorted_in_rank == values_shape.size(), + "If the shape of sorted sequence is not 1D, the ranks of the inputs have to be compatible."); + using TDim = typename TShape::value_type; + for (size_t i = 0; i < sorted_in_rank - 1; ++i) { + NODE_SHAPE_INFER_CHECK(op, + input_shapes, + TDim::merge(output_shape[i], values_shape[i], sorted_shape[i]), + "All dimensions but the last one have to be compatible."); + } } - const bool sorted_values_merge_success = TShape::merge_into(output_shape, sorted_shape_last_dynamic); - - NODE_VALIDATION_CHECK(op, sorted_values_merge_success, "Shapes of sorted sequence and values are not compatible."); - return {std::move(output_shape)}; } } // namespace v15 diff --git a/src/core/src/op/paged_attention.cpp b/src/core/src/op/paged_attention.cpp index e3771bcbf92937..261b0ce1c47605 100644 --- a/src/core/src/op/paged_attention.cpp +++ b/src/core/src/op/paged_attention.cpp @@ -146,13 +146,27 @@ void PagedAttentionExtension::validate_and_infer_types() { get_input_element_type(12), "."); - set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); - set_output_type(1, get_input_element_type(0), {Dimension::dynamic()}); + if (m_output_type[0] == ov::element::undefined) { + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); + } else { + set_output_type(0, m_output_type[0], get_input_partial_shape(0)); + } 
+ + if (m_output_type[1] == ov::element::undefined) { + set_output_type(1, get_input_element_type(0), {Dimension::dynamic()}); + } else { + set_output_type(1, m_output_type[1], {Dimension::dynamic()}); + } } std::shared_ptr PagedAttentionExtension::clone_with_new_inputs(const ov::OutputVector& new_args) const { return std::make_shared(new_args); } +void PagedAttentionExtension::set_out_type(int index, const ov::element::Type& output_type) { + OPENVINO_ASSERT(index < 2, "Output index should be 0 or 1, but got " + std::to_string(index)); + m_output_type[index] = output_type; +} + } // namespace op } // namespace ov diff --git a/src/core/src/op/search_sorted.cpp b/src/core/src/op/search_sorted.cpp index d3f26a674eef91..8b9bb012b27106 100644 --- a/src/core/src/op/search_sorted.cpp +++ b/src/core/src/op/search_sorted.cpp @@ -18,34 +18,11 @@ SearchSorted::SearchSorted(const Output& sorted_sequence, const Output 1) { - NODE_VALIDATION_CHECK(this, - sorted_shape.rank().get_length() == values_shape.rank().get_length(), - "Sorted sequence and values have different ranks."); - - for (int64_t i = 0; i < sorted_shape.rank().get_length() - 1; ++i) { - NODE_VALIDATION_CHECK(this, - sorted_shape[i].compatible(values_shape[i]), - "Sorted sequence and values has different ", - i, - " dimension."); - } - } - - return true; -} - -void SearchSorted::validate_and_infer_types() { - OV_OP_SCOPE(v15_SearchSorted_validate_and_infer_types); const auto& output_shapes = shape_infer(this, ov::util::get_node_input_partial_shapes(*this)); set_output_type(0, ov::element::i64, output_shapes[0]); } diff --git a/src/core/tests/type_prop/search_sorted.cpp b/src/core/tests/type_prop/search_sorted.cpp index efc2c865416143..6dd10ad0ac3f5f 100644 --- a/src/core/tests/type_prop/search_sorted.cpp +++ b/src/core/tests/type_prop/search_sorted.cpp @@ -57,6 +57,10 @@ TEST(type_prop, search_sorted_shape_infer_sorted_1d_values_dynamic) { PerformShapeTest({8}, {-1, -1, 3}, {-1, -1, 3}); } +TEST(type_prop, search_sorted_shape_infer_scalar_values) { + PerformShapeTest({100}, {}, {}); +} + TEST(type_prop, search_sorted_shape_infer_both_dynamic_1) { PerformShapeTest({1, -1, 7, -1}, {-1, 3, -1, 10}, {1, 3, 7, 10}); } @@ -93,6 +97,19 @@ TEST(type_prop, search_sorted_shape_infer_both_dynamic_9) { PerformShapeTest({-1, -1}, PartialShape::dynamic(), {-1, -1}); } +TEST(type_prop, search_sorted_shape_symbols) { + PartialShape sorted_shape{1, 3, 7, 100}; + PartialShape values_shape{-1, -1, -1, 10}; + auto sorted_symbols = set_shape_symbols(sorted_shape); + auto values_symbols = set_shape_symbols(values_shape); + auto sorted = make_shared(element::i32, sorted_shape); + auto values = make_shared(element::i32, values_shape); + auto search_sorted_op = make_shared(sorted, values); + EXPECT_EQ(search_sorted_op->get_element_type(), element::i64); + EXPECT_THAT(get_shape_symbols(search_sorted_op->get_output_partial_shape(0)), + testing::ElementsAre(values_symbols[0], values_symbols[1], values_symbols[2], values_symbols[3])); +} + TEST(type_prop, search_sorted_shape_infer_different_types) { auto sorted = make_shared(element::f32, Shape{1, 3, 6}); auto values = make_shared(element::i32, Shape{1, 3, 6}); @@ -102,13 +119,27 @@ TEST(type_prop, search_sorted_shape_infer_different_types) { TEST(type_prop, search_sorted_shape_infer_wrong_rank) { auto sorted = make_shared(element::i32, Shape{1, 1, 3, 6}); auto values = make_shared(element::i32, Shape{1, 3, 6}); - EXPECT_THROW_SUBSTRING(sorted, values, std::string("Sorted sequence and values have different ranks")); + 
EXPECT_THROW_SUBSTRING(sorted, + values, + std::string("sequence is not 1D, the ranks of the inputs have to be compatible")); } TEST(type_prop, search_sorted_shape_infer_wrong_dim) { auto sorted = make_shared(element::i32, Shape{1, 1, 3, 6}); auto values = make_shared(element::i32, Shape{1, 1, 5, 6}); - EXPECT_THROW_SUBSTRING(sorted, values, std::string(" different 2 dimension.")); + EXPECT_THROW_SUBSTRING(sorted, values, std::string("All dimensions but the last one have to be compatible")); +} + +TEST(type_prop, search_sorted_shape_infer_scalar_sorted_sequence) { + auto sorted = make_shared(element::i32, Shape{}); + auto values = make_shared(element::i32, Shape{1, 1, 5, 6}); + EXPECT_THROW_SUBSTRING(sorted, values, std::string("The sorted sequence input cannot be a scalar")); +} + +TEST(type_prop, search_sorted_shape_infer_scalar_values_and_nd_sequence) { + auto sorted = make_shared(element::i32, Shape{2, 2}); + auto values = make_shared(element::i32, Shape{}); + EXPECT_THROW_SUBSTRING(sorted, values, std::string("the ranks of the inputs have to be compatible")); } #undef EXPECT_THROW_SUBSTRING \ No newline at end of file diff --git a/src/frontends/onnx/frontend/CMakeLists.txt b/src/frontends/onnx/frontend/CMakeLists.txt index 80fd16e2ed6483..f07b6cf999fea8 100644 --- a/src/frontends/onnx/frontend/CMakeLists.txt +++ b/src/frontends/onnx/frontend/CMakeLists.txt @@ -77,7 +77,7 @@ ov_add_frontend(NAME onnx FILEDESCRIPTION "FrontEnd to load and convert ONNX file format" LINK_LIBRARIES openvino_onnx_common openvino::core::dev) -set(ONNX_OPSET_VERSION 20 CACHE INTERNAL "Supported version of ONNX operator set") +set(ONNX_OPSET_VERSION 21 CACHE INTERNAL "Supported version of ONNX operator set") target_compile_definitions(${TARGET_NAME} PRIVATE ONNX_OPSET_VERSION=${ONNX_OPSET_VERSION}) if(BUILD_SHARED_LIBS) diff --git a/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp b/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp index fe2ea3106e31ee..b09bc73467bc10 100644 --- a/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp +++ b/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp @@ -11,10 +11,15 @@ #include "openvino/frontend/exception.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/convert.hpp" +#include "openvino/op/convert_like.hpp" #include "openvino/op/multiply.hpp" #include "openvino/op/reshape.hpp" +#include "openvino/op/shape_of.hpp" #include "openvino/op/subtract.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/unsqueeze.hpp" #include "utils/common.hpp" +#include "utils/reshape.hpp" using namespace ov::op; namespace ov { @@ -188,8 +193,78 @@ ov::OutputVector dequantize_linear(const ov::frontend::onnx::Node& node) { // these reshapes make sure that dequantization happens over the specified axis return detail::dequantize_linear(x, scale, zero_point, node.get_attribute_value("axis", 1), node); } -ONNX_OP("DequantizeLinear", OPSET_SINCE(13), ai_onnx::opset_13::dequantize_linear); +ONNX_OP("DequantizeLinear", {13, 18}, ai_onnx::opset_13::dequantize_linear); } // namespace opset_13 + +namespace opset_19 { +ONNX_OP("DequantizeLinear", {19, 20}, ai_onnx::opset_13::dequantize_linear); +} // namespace opset_19 + +namespace opset_21 { +ov::OutputVector dequantize_linear(const ov::frontend::onnx::Node& node) { + common::default_op_checks(node, 2); + + const ov::OutputVector inputs{node.get_ov_inputs()}; + const auto& src_x = inputs[0]; + ov::Output scale = inputs[1]; + const auto& scale_shape = scale.get_partial_shape(); + ov::Output zp; + + // 
When no blocking dequantization is required - use regular DequantizeLinear + if (scale_shape.rank().is_static() && scale_shape.rank().get_length() <= 1) { + return ai_onnx::opset_13::dequantize_linear(node); + } + + FRONT_END_GENERAL_CHECK(scale_shape.rank().is_static(), "Rank of the input data tensor has to be known (static)."); + FRONT_END_GENERAL_CHECK(scale_shape.rank().get_length() == 2, + "DequantizeLinear cannot operate with more than 2D scales"); + FRONT_END_GENERAL_CHECK(src_x.get_partial_shape().is_static(), + "DequantizeLinear cannot operate with dynamic shapes of input X"); + + const auto& unsqueezed_axes = std::make_shared(ov::element::i64, Shape{1}, std::vector{1}); + + if (inputs.size() > 2) { + zp = inputs[2]; + if (zp.get_element_type() != scale.get_element_type()) { + zp = std::make_shared(zp, scale); + } + zp = std::make_shared(zp, unsqueezed_axes); + } + + const auto axis = node.get_attribute_value("axis", 1); + const auto block_size = static_cast(node.get_attribute_value("block_size", 0)); + const auto scale_type = scale.get_element_type(); + + FRONT_END_GENERAL_CHECK(axis == 0, "Axis != 0 isn't supported"); + FRONT_END_GENERAL_CHECK(block_size > 0, "block_size must be greater than zero"); + FRONT_END_GENERAL_CHECK( + src_x.get_shape()[0] % block_size == 0, + "DequantizeLinear doesn't support case when first dimension of X cannot be divided by block_size"); + + const auto& x = src_x.get_element_type() == scale_type ? src_x : std::make_shared(src_x, scale); + // For further broadcasting scales and zp - reshape input to a shape [x.shape[0]/block_size, block_size, x.shape[1]] + ov::Output broadcastable_x = + op::util::reshape(x, Shape{static_cast(x.get_shape()[0]) / block_size, block_size, x.get_shape()[1]}); + + // Adding additional dimension for broadcasting + scale = std::make_shared(scale, unsqueezed_axes); + + if (zp.get_node_shared_ptr()) { + broadcastable_x = std::make_shared(broadcastable_x, zp); + } + + const auto& scaled_x = std::make_shared(broadcastable_x, scale); + + // Returning back a shape + const auto& reshaped_scaled_x = + std::make_shared(scaled_x, std::make_shared(src_x), false); + + reshaped_scaled_x->set_friendly_name(node.get_name()); + + return {reshaped_scaled_x}; +} +ONNX_OP("DequantizeLinear", OPSET_SINCE(21), ai_onnx::opset_21::dequantize_linear); +} // namespace opset_21 } // namespace ai_onnx } // namespace onnx } // namespace frontend diff --git a/src/frontends/onnx/tests/models/dequantize_linear_21.prototxt b/src/frontends/onnx/tests/models/dequantize_linear_21.prototxt new file mode 100644 index 00000000000000..0378ad13ce0ce9 --- /dev/null +++ b/src/frontends/onnx/tests/models/dequantize_linear_21.prototxt @@ -0,0 +1,63 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + name: "test_dequantize_21" + initializer { + dims: 6 + dims: 3 + data_type: 21 + name: "data" + raw_data: "\x99\x99\x99\x99\x99\x99\x99\x99\x99" + } + initializer { + dims: 2 + dims: 3 + data_type: 1 + name: "scale" + raw_data: "\x00\x00\x80\x3f\x00\x00\x80\x3f\x00\x00\x80\x3f\x00\x00\x80\x3f\x00\x00\x80\x3f\x00\x00\x80\x3f" + } + initializer { + dims: 2 + dims: 3 + data_type: 21 + name: "zp" + raw_data: "\x78\x56\x34" + } + node { + input: "data" + input: "scale" + input: "zp" + output: "output" + name: "DequantizeNode" + op_type: "DequantizeLinear" + attribute { + name: "axis" + i: 0 + type: INT + } + attribute { + name: "block_size" + i: 3 + type: INT + } + } + output { + name: "output" + type { + tensor_type { + elem_type: 1 + shape { + dim { + 
dim_value: 6 + } + dim { + dim_value: 3 + } + } + } + } + } +} +opset_import { + version: 21 +} diff --git a/src/frontends/onnx/tests/onnx_import_quant.in.cpp b/src/frontends/onnx/tests/onnx_import_quant.in.cpp index c2d48c292cb8c1..ad85ef98ede8d9 100644 --- a/src/frontends/onnx/tests/onnx_import_quant.in.cpp +++ b/src/frontends/onnx/tests/onnx_import_quant.in.cpp @@ -317,6 +317,16 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_model_dequantize_linear_1d_zero_scale_uint8_ test_case.run(); } +OPENVINO_TEST(${BACKEND_NAME}, onnx_model_dequantize_linear_opset21) { + auto model = convert_model("dequantize_linear_21.onnx"); + + auto test_case = ov::test::TestCase(model, s_device); + + test_case.add_expected_output({6, 3}, + std::vector{1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6, 4, 5, 6}); + test_case.run(); +} + OPENVINO_TEST(${BACKEND_NAME}, onnx_model_dequantize_linear_scalar_ignore_axis) { auto model = convert_model("dequantize_linear_scalar_ignore_axis.onnx"); diff --git a/src/inference/src/cache_manager.hpp b/src/inference/src/cache_manager.hpp index 9e9ebd3ddcbc2b..c441811c3cfd02 100644 --- a/src/inference/src/cache_manager.hpp +++ b/src/inference/src/cache_manager.hpp @@ -14,6 +14,7 @@ #include #include +#include "openvino/runtime/shared_buffer.hpp" #include "openvino/util/file_util.hpp" #include "openvino/util/mmap_object.hpp" @@ -77,9 +78,10 @@ class ICacheManager { * Otherwise, model will not be read from cache and will be loaded as usual * * @param id Id of cache (hash of the model) + * @param enable_mmap use mmap or ifstream to read model file * @param reader Lambda function to be called when input stream is created */ - virtual void read_cache_entry(const std::string& id, StreamReader reader, bool mmap = false) = 0; + virtual void read_cache_entry(const std::string& id, bool enable_mmap, StreamReader reader) = 0; /** * @brief Callback when OpenVINO intends to remove cache entry @@ -130,13 +132,17 @@ class FileStorageCacheManager final : public ICacheManager { writer(stream); } - void read_cache_entry(const std::string& id, StreamReader reader, bool mmap = false) override { + void read_cache_entry(const std::string& id, bool enable_mmap, StreamReader reader) override { // Fix the bug caused by pugixml, which may return unexpected results if the locale is different from "C". 
ScopedLocale plocal_C(LC_ALL, "C"); auto blob_file_name = getBlobFile(id); if (ov::util::file_exists(blob_file_name)) { - if (mmap) { - MmapStream stream(blob_file_name); + if (enable_mmap) { + auto mmap = ov::load_mmap_object(blob_file_name); + auto shared_buffer = + std::make_shared>>(mmap->data(), mmap->size(), mmap); + OwningSharedStreamBuffer buf(shared_buffer); + std::istream stream(&buf); reader(stream); } else { std::ifstream stream(blob_file_name, std::ios_base::binary); diff --git a/src/inference/src/dev/core_impl.cpp b/src/inference/src/dev/core_impl.cpp index 9f55dc53ccd24f..32b43f346e9e44 100644 --- a/src/inference/src/dev/core_impl.cpp +++ b/src/inference/src/dev/core_impl.cpp @@ -1397,19 +1397,12 @@ ov::SoPtr ov::CoreImpl::compile_model_and_cache(ov::Plugin& return compiled_model; } -static bool does_plugin_support_model_caching_with_mmap(const ov::Plugin& plugin) { - bool supported = plugin.supports_model_caching(); - supported &= - ov::util::contains(plugin.get_property(ov::internal::supported_properties), ov::internal::caching_with_mmap); - return supported; -} - ov::SoPtr ov::CoreImpl::load_model_from_cache( const CacheContent& cacheContent, ov::Plugin& plugin, const ov::AnyMap& config, const ov::SoPtr& context, - std::function()> compile_model_lambda) { + std::function()> compile_model_lambda) const { ov::SoPtr compiled_model; struct HeaderException {}; @@ -1418,6 +1411,8 @@ ov::SoPtr ov::CoreImpl::load_model_from_cache( try { cacheContent.cacheManager->read_cache_entry( cacheContent.blobId, + coreConfig.get_enable_mmap() && ov::util::contains(plugin.get_property(ov::internal::supported_properties), + ov::internal::caching_with_mmap), [&](std::istream& networkStream) { OV_ITT_SCOPE(FIRST_INFERENCE, ov::itt::domains::LoadTime, @@ -1454,8 +1449,7 @@ ov::SoPtr ov::CoreImpl::load_model_from_cache( update_config[ov::loaded_from_cache.name()] = true; compiled_model = context ? 
plugin.import_model(networkStream, context, update_config) : plugin.import_model(networkStream, update_config); - }, - does_plugin_support_model_caching_with_mmap(plugin)); + }); } catch (const HeaderException&) { // For these exceptions just remove old cache and set that import didn't work cacheContent.cacheManager->remove_cache_entry(cacheContent.blobId); diff --git a/src/inference/src/dev/core_impl.hpp b/src/inference/src/dev/core_impl.hpp index 79b1b96d57ac30..7cf12f3ba3280c 100644 --- a/src/inference/src/dev/core_impl.hpp +++ b/src/inference/src/dev/core_impl.hpp @@ -149,12 +149,12 @@ class CoreImpl : public ov::ICore, public std::enable_shared_from_this& context, const CacheContent& cacheContent) const; - static ov::SoPtr load_model_from_cache( + ov::SoPtr load_model_from_cache( const CacheContent& cacheContent, ov::Plugin& plugin, const ov::AnyMap& config, const ov::SoPtr& context, - std::function()> compile_model_lambda); + std::function()> compile_model_lambda) const; bool device_supports_model_caching(const ov::Plugin& plugin) const; diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp index a4c99e2cc1fca7..d6208e0a43bbe1 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp @@ -1982,6 +1982,51 @@ std::set> jit_soft_sign_emitter::get_supported_precis return {{element::f32}}; } +/// SQUARE_ROOT /// +jit_sqrt_emitter::jit_sqrt_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + prepare_table(); + } + +jit_sqrt_emitter::jit_sqrt_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { + prepare_table(); + } + +size_t jit_sqrt_emitter::get_inputs_count() const { + return 1; +} + +void jit_sqrt_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel"); + } +} + +template +void jit_sqrt_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); + + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + TReg src = TReg(in_vec_idxs[0]); + TReg dst = TReg(out_vec_idxs[0]); + + h->fsqrt(dst.s, src.s); +} + +std::set> jit_sqrt_emitter::get_supported_precisions( + const std::shared_ptr& node) { + return {{element::f32}}; +} + /// SUBTRACT /// jit_subtract_emitter::jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp index ccd82bc5b628e7..afecd3029f58db 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp @@ -800,14 +800,34 @@ class jit_soft_sign_emitter : public jit_emitter { static 
std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: - std::unique_ptr exp_emitter; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; }; +class jit_sqrt_emitter : public jit_emitter { +public: + jit_sqrt_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); + + jit_sqrt_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); + + size_t get_inputs_count() const override; + + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); + +private: + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; + + template + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; +}; + class jit_subtract_emitter : public jit_emitter { public: jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index a17b8d28e17f5d..dc0f953efe70ab 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -194,8 +194,8 @@ void Graph::Replicate(const std::shared_ptr &model, const auto port = unusedOutput.get_index(); const auto nodeName = std::string("stub_") + std::to_string(unusedOutput.get_index()) + "_" + parentNode->getName(); const NodePtr outNode = std::make_shared(parentNode->outputShapes[port], - parentNode->getOriginalOutputPrecisionAtPort(port), - nodeName, "Result", m_context); + parentNode->getOriginalOutputPrecisionAtPort(port), + nodeName, "Result", m_context); CreateEdge(parentNode, outNode, port, 0); AddNode(outNode); } diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index 34e48dea50cbfa..7c23d55fc4147a 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -655,7 +655,7 @@ std::vector Node::getChildEdgesAtPort(int inputNum) const { if (!edge) OPENVINO_THROW("Node ", getName(), " contains dead weak ptr"); if (edge->getInputNum() == inputNum) - res.push_back(edge); + res.emplace_back(std::move(edge)); } return res; } @@ -793,11 +793,10 @@ void Node::redefineOutputMemory(const std::vector &newOutputShapes) void Node::redefineOutputMemory(const size_t port, const VectorDims& new_output_shape) { const auto edges = getChildEdgesAtPort(port); + static const VectorDims single_element_shape = {1}; + // avoid 0D shape incompatible - auto new_shape = new_output_shape; - if (new_shape.empty()) { - new_shape.push_back(1); - } + const auto& new_shape = new_output_shape.empty() ? 
single_element_shape : new_output_shape; const auto& curr_desc = edges[0]->getMemory().getDesc(); if (curr_desc.getShape().isStatic() && curr_desc.getShape().getStaticDims() == new_shape) { diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp index 7848e479f175e4..586e7f0705643f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp @@ -48,6 +48,7 @@ bool JitEltwiseExecutor::isSupported( Algorithm::EltwiseSelect, Algorithm::EltwiseSigmoid, Algorithm::EltwiseSoftSign, + Algorithm::EltwiseSqrt, Algorithm::EltwiseSubtract, Algorithm::EltwiseSwish, Algorithm::EltwiseTanh); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp index 59a5f812499481..98eb279bb26d48 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp @@ -670,6 +670,7 @@ std::shared_ptr jit_uni_eltwise_generic::create_eltwise_emitte OV_CASE(Algorithm::EltwiseSelect, ov::intel_cpu::aarch64::jit_select_emitter), OV_CASE(Algorithm::EltwiseSigmoid, ov::intel_cpu::aarch64::jit_sigmoid_emitter), OV_CASE(Algorithm::EltwiseSoftSign, ov::intel_cpu::aarch64::jit_soft_sign_emitter), + OV_CASE(Algorithm::EltwiseSqrt, ov::intel_cpu::aarch64::jit_sqrt_emitter), OV_CASE(Algorithm::EltwiseSubtract, ov::intel_cpu::aarch64::jit_subtract_emitter), OV_CASE(Algorithm::EltwiseSwish, ov::intel_cpu::aarch64::jit_swish_emitter), OV_CASE(Algorithm::EltwiseTanh, ov::intel_cpu::aarch64::jit_tanh_emitter)); @@ -847,6 +848,7 @@ std::set> eltwise_precision_helper::get_supported_pre OV_CASE(Algorithm::EltwiseSelect, jit_select_emitter), OV_CASE(Algorithm::EltwiseSigmoid, jit_sigmoid_emitter), OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), + OV_CASE(Algorithm::EltwiseSqrt, jit_sqrt_emitter), OV_CASE(Algorithm::EltwiseSubtract, jit_subtract_emitter), OV_CASE(Algorithm::EltwiseSwish, jit_swish_emitter), OV_CASE(Algorithm::EltwiseTanh, jit_tanh_emitter)); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp index a9998e88402ca7..d95f973fa9f2f0 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp @@ -180,7 +180,6 @@ static void attn_quant_mt(const ov::intel_cpu::PlainTensor& k_src, // For compatibility, all input_kvs are permuted to BHLS size_t B = k_src.m_dims[0], H = k_src.m_dims[1], L1 = k_src.m_dims[2], S = k_src.m_dims[3]; // Internal LBHS layout has strides[L] > strides[B] - assert(k_src.m_strides[2] > k_src.m_strides[0]); parallel_for3d(L1, B, H, [&](size_t m, size_t b, size_t h) { auto p_k = k_scale_zp.ptr(m, b, h); auto p_v = v_scale_zp.ptr(m, b, h); @@ -238,6 +237,8 @@ void attn_quantkv(const ov::intel_cpu::PlainTensor& k_src, attn_quant_mt(k_src, v_src, k_dst, v_dst, k_scale_zp, v_scale_zp); } else if (k_src.get_precision() == ov::element::bf16 && k_dst.get_precision() == ov::element::u8) { attn_quant_mt(k_src, v_src, k_dst, v_dst, k_scale_zp, v_scale_zp); + } else if (k_src.get_precision() == ov::element::f16 && k_dst.get_precision() == ov::element::u8) { + attn_quant_mt(k_src, v_src, k_dst, v_dst, k_scale_zp, v_scale_zp); } else { 
OPENVINO_THROW("unsupport src type: ", k_src.get_precision(), ", dst type: ", k_dst.get_precision(), " in attn_quantkv"); } @@ -252,6 +253,8 @@ void paged_attn_quantkv(const ov::intel_cpu::PlainTensor& k_src, paged_attn_quant_mt(k_src, v_src, k_dst, v_dst, slot_mapping); } else if (k_src.get_precision() == ov::element::bf16 && k_dst.get_precision() == ov::element::u8) { paged_attn_quant_mt(k_src, v_src, k_dst, v_dst, slot_mapping); + } else if (k_src.get_precision() == ov::element::f16 && k_dst.get_precision() == ov::element::u8) { + paged_attn_quant_mt(k_src, v_src, k_dst, v_dst, slot_mapping); } else { OPENVINO_THROW("unsupport src type: ", k_src.get_precision(), ", dst type: ", k_dst.get_precision(), " in paged_attn_quantkv"); } diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp index 3341f6f6082d99..2956c8a6a6b5b8 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp @@ -37,15 +37,22 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); return _mm512_castsi512_ps(_mm512_slli_epi32(y, 16)); } + // load addr to __m512 reg + inline __m512 mm512_uni_loadu_ps(const float* a) { + return _mm512_loadu_ps(a); + } + inline __m512 mm512_uni_loadu_ps(const ov::bfloat16* a) { auto vec_bf16 = _mm256_loadu_si256(reinterpret_cast(a)); return cvt_bf16_to_fp32(vec_bf16); } - inline __m512 mm512_uni_loadu_ps(const float* a) { - return _mm512_loadu_ps(a); + inline __m512 mm512_uni_loadu_ps(const ov::float16* a) { + auto vec_f16 = _mm256_loadu_si256(reinterpret_cast(a)); + return _mm512_cvtph_ps(vec_f16); } + // load addr to __m512 reg inline __m512 mm512_uni_loadu_tail_ps(const float* a, size_t count) { __mmask16 mask = (1 << count) - 1; return _mm512_maskz_loadu_ps(mask, a); @@ -57,6 +64,13 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); return cvt_bf16_to_fp32(bf16_vec); } + inline __m512 mm512_uni_loadu_tail_ps(const ov::float16* a, size_t count) { + auto mask = (1 << count) - 1; + auto f16_vec = _mm256_maskz_loadu_epi16(mask, a); + return _mm512_cvtph_ps(f16_vec); + } + + // store __m512 reg to addr inline void mm512_uni_storeu_ps(float* a, __m512 v) { _mm512_storeu_ps(a, v); } @@ -72,6 +86,13 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); x = _mm512_mask_blend_epi32(mask, nan, x); // Check NaN before converting back to bf16 _mm256_storeu_si256(reinterpret_cast<__m256i *>(addr), _mm512_cvtepi32_epi16(x)); } + + inline void mm512_uni_storeu_ps(ov::float16* addr, __m512 v) { + __m256i vec_f16 = _mm512_cvtps_ph(v, 0); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(addr), vec_f16); + } + + // store __m512 reg to addr inline void mm512_uni_mask_storeu_ps(ov::bfloat16 *addr, __mmask16 mask_addr, __m512 xps) { __m512i xpi32 = _mm512_castps_si512(xps); __m512i nan = _mm512_set1_epi32(0xffff); @@ -85,18 +106,29 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); _mm512_mask_cvtepi32_storeu_epi16(addr, mask_addr, x); } - inline __m512 mm512_uni_loadu_ps(ov::float16* a) { - auto vec_f16 = _mm256_loadu_si256(reinterpret_cast(a)); - return _mm512_cvtph_ps(vec_f16); + inline void mm512_uni_storeu_tail_ps(float *addr, __m512 v, size_t count) { + __mmask16 mask_addr = (1 << count) - 1; + _mm512_mask_storeu_ps(addr, mask_addr, v); } - inline __m512 mm512_uni_loadu_tail_ps(const ov::float16* a, size_t count) { - 
auto mask = (1 << count) - 1; - auto f16_vec = _mm256_maskz_loadu_epi16(mask, a); - return _mm512_cvtph_ps(f16_vec); + + inline void mm512_uni_storeu_tail_ps(ov::bfloat16 *addr, __m512 v, size_t count) { + __mmask16 mask_addr = (1 << count) - 1; + __m512i xpi32 = _mm512_castps_si512(v); + __m512i nan = _mm512_set1_epi32(0xffff); + auto mask = _mm512_cmp_ps_mask(v, v, _CMP_ORD_Q); + __m512i ones = _mm512_set1_epi32(0x1); + __m512i vec_bias = _mm512_set1_epi32(0x7fff); + auto x = _mm512_and_si512(_mm512_srli_epi32(xpi32, 16), ones); // LSB = x[16] + x = _mm512_add_epi32(x, vec_bias); // rounding_bias = 0x7fff + LSB + x = _mm512_srli_epi32(_mm512_add_epi32(x, xpi32), 16); // x = (x + rounding_bias) >> 16; + x = _mm512_mask_blend_epi32(mask, nan, x); // Check NaN before converting back to bf16 + _mm512_mask_cvtepi32_storeu_epi16(addr, mask_addr, x); } - inline void mm512_uni_storeu_ps(ov::float16* addr, __m512 v) { + + inline void mm512_uni_storeu_tail_ps(ov::float16 *addr, __m512 v, size_t count) { + __mmask16 mask_addr = (1 << count) - 1; __m256i vec_f16 = _mm512_cvtps_ph(v, 0); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(addr), vec_f16); + _mm256_mask_storeu_epi16(reinterpret_cast<__m256i *>(addr), mask_addr, vec_f16); } #endif @@ -115,12 +147,11 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); }; return _mm256_loadu_si256(&mask[N7]); } + + // load addr to __m256 reg inline __m256 mm256_uni_loadu_ps(const float* a) { return _mm256_loadu_ps(a); } - inline void mm256_uni_storeu_ps(float* a, __m256 v) { - _mm256_storeu_ps(a, v); - } inline __m256 mm256_uni_loadu_ps(const ov::bfloat16* a) { auto vec_bf16 = _mm_loadu_si128(reinterpret_cast(a)); @@ -128,6 +159,13 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); return o; } + inline __m256 mm256_uni_loadu_ps(const ov::float16* a) { + auto vec_f16 = _mm_loadu_si128(reinterpret_cast(a)); + auto o = _mm256_cvtph_ps(vec_f16); + return o; + } + + // load addr tail to __m256 reg inline __m256 mm256_uni_loadu_tail_ps(const float* a, const size_t count) { auto mask = get_mask(count); return _mm256_maskload_ps(a, mask); @@ -140,6 +178,17 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); return mm256_uni_loadu_ps(tmp_values); } + inline __m256 mm256_uni_loadu_tail_ps(const ov::float16* a, const size_t count) { + ov::float16 tmp_values[8] = {0}; + std::memcpy(tmp_values, a, count * sizeof(ov::float16)); + return mm256_uni_loadu_ps(tmp_values); + } + + // store __m256 reg to addr + inline void mm256_uni_storeu_ps(float* a, __m256 v) { + _mm256_storeu_ps(a, v); + } + inline void mm256_uni_storeu_ps(ov::bfloat16 *addr, __m256 xps) { __m256i xpi32 = _mm256_castps_si256(xps); __m256i nan = _mm256_set1_epi32(0xffff); @@ -156,21 +205,17 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); _mm_storeu_si128(reinterpret_cast<__m128i *>(addr), bf16_o); } - inline __m256 mm256_uni_loadu_ps(ov::float16* a) { - auto vec_f16 = _mm_loadu_si128(reinterpret_cast<__m128i*>(a)); - auto o = _mm256_cvtph_ps(vec_f16); - return o; - } - inline __m256 mm256_uni_loadu_tail_ps(const ov::float16* a, const size_t count) { - ov::float16 tmp_values[8] = {0}; - std::memcpy(tmp_values, a, count * sizeof(ov::float16)); - return mm256_uni_loadu_ps(tmp_values); - } inline void mm256_uni_storeu_ps(ov::float16* a, __m256 v) { __m128i vec_f16 = _mm256_cvtps_ph(v, 0); _mm_storeu_si128(reinterpret_cast<__m128i *>(a), vec_f16); } + // store __m256 to addr + inline void 
mm256_uni_storeu_tail_ps(float *addr, __m256 v, size_t count) { + const auto mask = get_mask(count); + return _mm256_maskstore_ps(addr, mask, v); + } + inline void hsum(__m256& x) { __m256 y; // x: 0 1 2 3 4 5 6 7 y = _mm256_permute_ps(x, 0x39); // y: 1 2 3 0 5 6 7 4 @@ -292,4 +337,4 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); } // namespace XARCH } // namespace Cpu } // namespace Extensions -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp index 1fe7b811b922a8..971aa6bb58c994 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp @@ -14,6 +14,7 @@ #endif #include "openvino/core/type/bfloat16.hpp" +#include "openvino/core/type/float16.hpp" #include "openvino/core/parallel.hpp" #include "executor_pa.hpp" #include "executor_pa_common.hpp" @@ -619,7 +620,8 @@ void transpose_16NxK(TDST* dst, TSRC* src, TDST* tmp, size_t N, size_t K, size_t } #if defined(HAVE_AVX512F) -static void transpose_16NxK(ov::bfloat16* dst, ov::bfloat16* src, ov::bfloat16* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { +template::value || std::is_same::value), bool>::type> +static void transpose_16NxK(T* dst, T* src, T* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // will treat as uint32_t transpose auto s = reinterpret_cast(src); auto d = reinterpret_cast(dst); @@ -669,8 +671,8 @@ void dequant(TDST* dst, uint8_t* src, size_t N, size_t K) { } #if defined(HAVE_AVX512F) -// pack bf16/u8 to bf16 -static void pack_32x32_kernel(ov::bfloat16* dst, ov::bfloat16* src, size_t dst_stride, size_t src_stride) { +template::value || std::is_same::value), bool>::type> +static void pack_32x32_kernel(T* dst, T* src, size_t dst_stride, size_t src_stride) { static const uint64_t idx[8] = {0, 4, 1, 5, 2, 6, 3, 7}; auto midx = _mm512_loadu_si512(idx); for (size_t i = 0; i < 16; i++) { @@ -687,7 +689,8 @@ static void pack_32x32_kernel(ov::bfloat16* dst, ov::bfloat16* src, size_t dst_s } } -static void pack_32x16_kernel(ov::bfloat16* dst, ov::bfloat16* src, size_t dst_stride, size_t src_stride) { +template::value || std::is_same::value), bool>::type> +static void pack_32x16_kernel(T* dst, T* src, size_t dst_stride, size_t src_stride) { static const uint64_t idx[8] = {0, 4, 1, 5, 2, 6, 3, 7}; auto midx = _mm512_loadu_si512(idx); for (size_t i = 0; i < 16; i++) { @@ -704,7 +707,8 @@ static void pack_32x16_kernel(ov::bfloat16* dst, ov::bfloat16* src, size_t dst_s } } -static void pack_32Nx16K(ov::bfloat16* dst, ov::bfloat16* src, ov::bfloat16* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { +template::value || std::is_same::value), bool>::type> +static void pack_32Nx16K(T* dst, T* src, T* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { for (size_t n = 0; n < N; n += 32) { size_t k = 0; for (; k + 32 <= K; k += 32) { @@ -718,7 +722,8 @@ static void pack_32Nx16K(ov::bfloat16* dst, ov::bfloat16* src, ov::bfloat16* tmp } } -static void pack_32Nx16K(ov::bfloat16* dst, uint8_t* src, ov::bfloat16* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { +template::value || std::is_same::value), bool>::type> +static void pack_32Nx16K(T* dst, uint8_t* src, T* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // The layout for per token per head: // 
|scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| // The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) @@ -730,7 +735,7 @@ static void pack_32Nx16K(ov::bfloat16* dst, uint8_t* src, ov::bfloat16* tmp, siz s += src_stride + 2 * sizeof(float); t += src_stride; } - pack_32Nx16K(dst, tmp, reinterpret_cast(0), N, K, dst_stride, src_stride); + pack_32Nx16K(dst, tmp, reinterpret_cast(0), N, K, dst_stride, src_stride); } #endif @@ -769,7 +774,7 @@ struct MHAHelper { std::vector> _wv_gemm_acc; // second token std::shared_ptr _gemv; - bool _fastpath_valid = false; + ov::element::Type _fastpath_valid_prec = ov::element::undefined; // second token for bhl loop PlainTensor _weight_bhl; PlainTensor _output_bhl; @@ -851,11 +856,20 @@ struct MHAHelper { _qk_scratch_a.resize({_nthr, _qk_gemm[_block_size - 1]->get_scratch_a_size() / sizeof(DATA_TYPE)}); _wv_scratch_a.resize({_nthr, _wv_gemm[_block_size - 1]->get_scratch_a_size() / sizeof(DATA_TYPE)}); - _fastpath_valid = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::amx_bf16) && - (S % 32 == 0) && (block_size % 16 == 0) && (S <= 32 * 6) && precision_of::value == ov::element::bf16; - // aligned to cache line (64bytes=16*sizeof(float)) to avoid false sharing - if (_fastpath_valid && !_gemv) - _gemv = std::make_shared(static_cast(S), static_cast(block_size)); + if ((S % 32 == 0) && (block_size % 16 == 0) && (S <= 32 * 6)) { + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::amx_bf16) && + precision_of::value == ov::element::bf16 && + precision_of::value == ov::element::bf16) { + _fastpath_valid_prec = ov::element::bf16; + } else if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::amx_fp16) && + precision_of::value == ov::element::f16 && + precision_of::value == ov::element::f16) { + _fastpath_valid_prec = ov::element::f16; + } + } + if (one_of(_fastpath_valid_prec, ov::element::bf16, ov::element::f16) && !_gemv) { + _gemv = std::make_shared(static_cast(S), static_cast(block_size), _fastpath_valid_prec); + } } if (init_alibi_lookup && (!_alibi_lookup || _alibi_lookup.m_dims[0] < kv_len)) { @@ -903,7 +917,7 @@ struct MHAHelper { auto q_start = q_blk * _block_size; auto q_end = std::min(q_start + _block_size, q_len); auto q_cnt = q_end - q_start; - constexpr bool q_is_bf16 = precision_of::value == ov::element::bf16; + constexpr bool q_is_xf16 = one_of(precision_of::value, ov::element::bf16, ov::element::f16); constexpr bool q_cache_is_same = precision_of::value == precision_of::value; auto cur_kv_len_blocks = div_up(cur_kv_len, _block_size); for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { @@ -978,12 +992,12 @@ struct MHAHelper { // reuse float buffer, need to use float to compute offset auto* w_ptr = reinterpret_cast(_weight.ptr(ithr, h, 0, 0)); - float* fp32_out_ptr = q_is_bf16 ? _output.ptr(ithr, 0, h, 0) : output_emb.ptr(q_start, h * _S); + float* fp32_out_ptr = q_is_xf16 ? _output.ptr(ithr, 0, h, 0) : output_emb.ptr(q_start, h * _S); // for each weight block, loop through all value block for (size_t v_blk = 0; v_blk < cur_kv_len_blocks; v_blk++) { DATA_TYPE* v_ptr; - if (q_is_bf16 || !q_cache_is_same) { + if (q_is_xf16 || !q_cache_is_same) { v_ptr = wv_scratch_b.ptr(v_blk, hk); } else { v_ptr = present_value.ptr(block_table[v_blk], hk); @@ -1004,11 +1018,11 @@ struct MHAHelper { _wv_scratch_a ? 
_wv_scratch_a.ptr(ithr, 0) : nullptr); } } - if (q_is_bf16) { + if (q_is_xf16) { attn_memcpy2d_kernel(_output.ptr(ithr, 0, h, 0), output_emb.ptr(q_start, h * _S), ov::element::f32, - ov::element::bf16, + precision_of::value, _output.stride(1), output_emb.stride(0), _S, @@ -1026,13 +1040,13 @@ struct MHAHelper { // output: [nthr, 32, H, S] void exec_kernel_one_bh(const PlainTensor& query, const PlainTensor& present_key, const PlainTensor& present_value, const PlainTensor& output_emb, const int32_t* block_table, size_t ithr, size_t hk, size_t q_len, size_t cur_kv_len, const PlainTensor& alibi_slopes, float* score_output) { - if (_fastpath_valid) { + if (one_of(_fastpath_valid_prec, ov::element::bf16, ov::element::f16)) { _gemv->tile_config(); for (size_t pk = 0, i = 0; pk < cur_kv_len; pk += _block_size, i++) { auto block_number = block_table[i]; for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { - (*_gemv)(query.ptr(h, pq), present_key.ptr(block_number, hk), + (*_gemv)(query.ptr(h, pq), present_key.ptr(block_number, hk), _weight.ptr(ithr, h, pq) + pk); } } @@ -1128,11 +1142,11 @@ struct MHAHelper { auto pk = pk_in_blocks * _block_size; if (pk < context_len) { auto block_number = block_indices.ptr()[block_indices_begins.ptr()[b] + pk_in_blocks]; - if (_fastpath_valid) { + if (one_of(_fastpath_valid_prec, ov::element::bf16, ov::element::f16)) { _gemv->tile_config(); for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { - (*_gemv)(query.ptr(b, h, pq), present_key.ptr(block_number, hk), + (*_gemv)(query.ptr(b, h, pq), present_key.ptr(block_number, hk), _weight_bhl.ptr(b, h, pq) + pk); } } @@ -1334,7 +1348,7 @@ struct MHA { const PlainTensor& alibi_slopes) { auto Hk = v_cache.m_dims[1]; - constexpr bool q_is_bf16 = precision_of::value == ov::element::bf16; + constexpr bool q_is_xf16 = one_of(precision_of::value, ov::element::bf16, ov::element::f16); constexpr bool q_cache_is_same = precision_of::value == precision_of::value; auto attn_work_count = _workitems.attn_work_size(); auto reorder_work_count = _workitems.reorder_work_size(); @@ -1360,7 +1374,7 @@ struct MHA { _helper._output.template ptr(ithr), _helper._block_size, _helper._S, _helper._block_size, _helper._S); - if (q_is_bf16) { + if (q_is_xf16) { pack_32Nx16K(_helper._wv_scratch_b.template ptr(batch_in_reorder, kv_block, hk), v_ptr, _helper._output.template ptr(ithr), @@ -1604,6 +1618,17 @@ std::shared_ptr make_pa_executor(ov::element::Type data_ } #else OPENVINO_THROW("make_pa_executor: bf16 needs avx512+ hardware."); +#endif + } else if (data_type == ov::element::f16) { +#if defined(HAVE_AVX512F) + if (kvcache_type == ov::element::u8) { + executor = std::make_shared>(); + } else { + OPENVINO_ASSERT(kvcache_type == ov::element::f16, "expect kvcache type f16, current: ", kvcache_type); + executor = std::make_shared>(); + } +#else + OPENVINO_THROW("make_pa_executor: f16 needs avx512+ hardware."); #endif } else if (data_type == ov::element::f32) { if (kvcache_type == ov::element::u8) { diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.cpp index 63a8a0f7d24062..70723a577b0c2b 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.cpp @@ -10,6 +10,7 @@ #include #include 
"openvino/core/type/bfloat16.hpp" +#include "openvino/core/type/float16.hpp" #include "openvino/core/parallel.hpp" #include "executor_pa_common.hpp" #include "utils/plain_tensor.hpp" @@ -57,7 +58,8 @@ void TileConfiger::generate() { ret(); } -JitMatMulVecAMX::JitMatMulVecAMX(int head_size, int block_size) : jit_generator(jit_name()), m_head_size(head_size), m_block_size(block_size) { +JitMatMulVecAMX::JitMatMulVecAMX(int head_size, int block_size, ov::element::Type amx_prec) : + jit_generator(jit_name()), m_head_size(head_size), m_block_size(block_size), m_amx_prec(amx_prec) { create_kernel(); m_tile_cfg.reset(1, 0, @@ -98,7 +100,11 @@ void JitMatMulVecAMX::generate() { tilezero(tmmC); for (int i = 0; i < num_B_tiles; i++) { tileloadd(tmmA, ptr[reg_k_addr + reg_stride_A + i * 64]); - tdpbf16ps(tmmC, tmmA, Xbyak::Tmm(tmmB0.getIdx() + i)); + if (m_amx_prec == ov::element::bf16) { + tdpbf16ps(tmmC, tmmA, Xbyak::Tmm(tmmB0.getIdx() + i)); + } else if (m_amx_prec == ov::element::f16) { + tdpfp16ps(tmmC, tmmA, Xbyak::Tmm(tmmB0.getIdx() + i)); + } } tilestored(ptr[reg_dst_addr + reg_stride_BC + m * sizeof(float)], tmmC); add(reg_k_addr, m_head_size * 2 * 16); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp index 237860ec692e76..bc21457a3285b4 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp @@ -69,9 +69,10 @@ class JitMatMulVecAMX : public dnnl::impl::cpu::x64::jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(JitMatMulVecAMX) int m_head_size; int m_block_size; + ov::element::Type m_amx_prec; TileConfiger m_tile_configer; TileConfig m_tile_cfg; - JitMatMulVecAMX(int head_size, int block_size); + JitMatMulVecAMX(int head_size, int block_size, ov::element::Type amx_prec); void tile_config() { m_tile_configer(&m_tile_cfg); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp index 3ce275d47e3d9d..0670c744a6da91 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp @@ -841,20 +841,20 @@ static void attn_reduce(ov::float16* dst, ov::float16* temp, size_t M, size_t S, template static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, - const ov::intel_cpu::PlainTensor& present_key, - const ov::intel_cpu::PlainTensor& present_value, - const ov::intel_cpu::PlainTensor& alibi_mask, - const ov::intel_cpu::PlainTensor& attention_mask, - const ov::intel_cpu::PlainTensor& beams, - ov::intel_cpu::PlainTensor& output_emb, - ov::intel_cpu::PlainTensor& buf_attn_w, - ov::intel_cpu::PlainTensor& buf_attn_score, - bool has_out_transpose, - bool auto_causal, - float d_scale, - const ov::intel_cpu::PlainTensor& past_k_scale_zp, - const ov::intel_cpu::PlainTensor& past_v_scale_zp, - ov::intel_cpu::PlainTensor& head_sum) { + const ov::intel_cpu::PlainTensor& present_key, + const ov::intel_cpu::PlainTensor& present_value, + const ov::intel_cpu::PlainTensor& alibi_mask, + const ov::intel_cpu::PlainTensor& attention_mask, + const ov::intel_cpu::PlainTensor& beams, + ov::intel_cpu::PlainTensor& output_emb, + ov::intel_cpu::PlainTensor& buf_attn_w, + ov::intel_cpu::PlainTensor& buf_attn_score, + bool has_out_transpose, + bool auto_causal, + float d_scale, + const 
ov::intel_cpu::PlainTensor& past_k_scale_zp, + const ov::intel_cpu::PlainTensor& past_v_scale_zp, + ov::intel_cpu::PlainTensor& head_sum) { ov::intel_cpu::PlainTensor causal_mask; bool select_nfltmax_at_0 = false; auto B = query.size(0); @@ -976,16 +976,16 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, attn_mask_ptr = reinterpret_cast(&attention_mask.at({b, h, pq, 0}, true)); uint8_t* cmask_ptr = causal_mask ? &causal_mask.at({b, h, pq, 0}, true) : nullptr; attn_softmax_kernel(buf_attn_w.ptr(b, h, pq), - buf_attn_w.ptr(b, h, pq), - d_scale, - alibi_ptr, - attn_mask_ptr, - cmask_ptr, - select_nfltmax_at_0, - ncausal, - cur_kv_len, - attn_mask_prec, - precision); + buf_attn_w.ptr(b, h, pq), + d_scale, + alibi_ptr, + attn_mask_ptr, + cmask_ptr, + select_nfltmax_at_0, + ncausal, + cur_kv_len, + attn_mask_prec, + precision); }); // attn_w * V @@ -1054,11 +1054,11 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = h_group * h_each_group_len; h < (h_group + 1) * h_each_group_len; h++) { attn_acc_value(buf_attn_score.ptr(ithr, b, pq, h), - buf_attn_w.ptr(b, h, pq)[pv], - v, - S, - p + 0, - p + 1); + buf_attn_w.ptr(b, h, pq)[pv], + v, + S, + p + 0, + p + 1); } } parallel_it_step(pv, kv_len, b, B, h_group, h_group_num); @@ -1093,86 +1093,36 @@ void mha_single_token(const ov::intel_cpu::PlainTensor& query, if (query.get_precision() == ov::element::bf16) { if (present_key.get_precision() == ov::element::u8) { mha_single_token_kernel(query, - present_key, - present_value, - alibi_mask, - attention_mask, - beams, - output_emb, - buf_attn_w, - buf_attn_score, - has_out_transpose, - auto_causal, - d_scale, - past_k_scale_zp, - past_v_scale_zp, - head_sum); + present_key, + present_value, + alibi_mask, + attention_mask, + beams, + output_emb, + buf_attn_w, + buf_attn_score, + has_out_transpose, + auto_causal, + d_scale, + past_k_scale_zp, + past_v_scale_zp, + head_sum); } else { mha_single_token_kernel(query, - present_key, - present_value, - alibi_mask, - attention_mask, - beams, - output_emb, - buf_attn_w, - buf_attn_score, - has_out_transpose, - auto_causal, - d_scale, - past_k_scale_zp, - past_v_scale_zp, - head_sum); - } - } else if (query.get_precision() == ov::element::f32) { - if (present_key.get_precision() == ov::element::u8) { - mha_single_token_kernel(query, - present_key, - present_value, - alibi_mask, - attention_mask, - beams, - output_emb, - buf_attn_w, - buf_attn_score, - has_out_transpose, - auto_causal, - d_scale, - past_k_scale_zp, - past_v_scale_zp, - head_sum); - } else if (present_key.get_precision() == ov::element::f16) { - mha_single_token_kernel(query, - present_key, - present_value, - alibi_mask, - attention_mask, - beams, - output_emb, - buf_attn_w, - buf_attn_score, - has_out_transpose, - auto_causal, - d_scale, - past_k_scale_zp, - past_v_scale_zp, - head_sum); - } else { - mha_single_token_kernel(query, - present_key, - present_value, - alibi_mask, - attention_mask, - beams, - output_emb, - buf_attn_w, - buf_attn_score, - has_out_transpose, - auto_causal, - d_scale, - past_k_scale_zp, - past_v_scale_zp, - head_sum); + present_key, + present_value, + alibi_mask, + attention_mask, + beams, + output_emb, + buf_attn_w, + buf_attn_score, + has_out_transpose, + auto_causal, + d_scale, + past_k_scale_zp, + past_v_scale_zp, + head_sum); } } else if (query.get_precision() == ov::element::f16) { #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) @@ -1196,8 +1146,90 @@ 
void mha_single_token(const ov::intel_cpu::PlainTensor& query, OPENVINO_THROW("Unsupported precision: ", query.get_precision()); } #else - OPENVINO_THROW("Unsupported precision: ", query.get_precision()); + if (present_key.get_precision() == ov::element::u8) { + mha_single_token_kernel(query, + present_key, + present_value, + alibi_mask, + attention_mask, + beams, + output_emb, + buf_attn_w, + buf_attn_score, + has_out_transpose, + auto_causal, + d_scale, + past_k_scale_zp, + past_v_scale_zp, + head_sum); + } else { + mha_single_token_kernel(query, + present_key, + present_value, + alibi_mask, + attention_mask, + beams, + output_emb, + buf_attn_w, + buf_attn_score, + has_out_transpose, + auto_causal, + d_scale, + past_k_scale_zp, + past_v_scale_zp, + head_sum); + } #endif + } else if (query.get_precision() == ov::element::f32) { + if (present_key.get_precision() == ov::element::u8) { + mha_single_token_kernel(query, + present_key, + present_value, + alibi_mask, + attention_mask, + beams, + output_emb, + buf_attn_w, + buf_attn_score, + has_out_transpose, + auto_causal, + d_scale, + past_k_scale_zp, + past_v_scale_zp, + head_sum); + } else if (present_key.get_precision() == ov::element::f16) { + mha_single_token_kernel(query, + present_key, + present_value, + alibi_mask, + attention_mask, + beams, + output_emb, + buf_attn_w, + buf_attn_score, + has_out_transpose, + auto_causal, + d_scale, + past_k_scale_zp, + past_v_scale_zp, + head_sum); + } else { + mha_single_token_kernel(query, + present_key, + present_value, + alibi_mask, + attention_mask, + beams, + output_emb, + buf_attn_w, + buf_attn_score, + has_out_transpose, + auto_causal, + d_scale, + past_k_scale_zp, + past_v_scale_zp, + head_sum); + } } else { OPENVINO_THROW("Unsupported precision: ", query.get_precision()); } @@ -1205,4 +1237,4 @@ void mha_single_token(const ov::intel_cpu::PlainTensor& query, } // namespace XARCH } // namespace Cpu } // namespace Extensions -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp index bffe0ee3761dd5..60c6a24ec5f2fa 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp @@ -751,14 +751,14 @@ inline void multiply_scalar(float* a, float* a_dst, const float val, const size_ while (i + vec_len_f32_avx512 <= size) { v_a = _mm512_loadu_ps(a + i); v_a = _mm512_mul_ps(v_a, v_scale); - _mm512_storeu_ps(a_dst + i, v_a); + mm512_uni_storeu_ps(a_dst + i, v_a); i += vec_len_f32_avx512; } if (i < size) { __mmask16 mask = (1 << (size - i)) - 1; v_a = _mm512_maskz_loadu_ps(mask, a + i); v_a = _mm512_mul_ps(v_a, v_scale); - _mm512_mask_storeu_ps(a_dst + i, mask, v_a); + mm512_uni_storeu_tail_ps(a_dst + i, v_a, size - i); i += (size - i); } @@ -768,14 +768,14 @@ inline void multiply_scalar(float* a, float* a_dst, const float val, const size_ while (i + vec_len_f32_avx2 <= size) { v_a = _mm256_loadu_ps(a + i); v_a = _mm256_mul_ps(v_a, v_scale); - _mm256_storeu_ps(a_dst + i, v_a); + mm256_uni_storeu_ps(a_dst + i, v_a); i += vec_len_f32_avx2; } if (i < size) { auto mask = get_mask(size - i); v_a = _mm256_maskload_ps(a + i, mask); v_a = _mm256_mul_ps(v_a, v_scale); - _mm256_maskstore_ps(a_dst + i, mask, v_a); + mm256_uni_storeu_tail_ps(a_dst + i, v_a, size - i); i += (size - i); } @@ -793,11 +793,12 @@ inline void multiply_scalar(float* a, float* 
a_dst, const float val, const size_ } } -inline void multiply_scalar(float* a, ov::bfloat16* a_dst, const float val, const size_t size) { +template::value || std::is_same::value), bool>::type> +inline void multiply_scalar(float* a, T* a_dst, const float val, const size_t size) { + size_t i = 0; #if defined(HAVE_AVX512F) auto v_scale = _mm512_set1_ps(val); __m512 v_a = {0}; - size_t i = 0; while (i + vec_len_f32_avx512 <= size) { v_a = _mm512_loadu_ps(a + i); v_a = _mm512_mul_ps(v_a, v_scale); @@ -808,10 +809,12 @@ inline void multiply_scalar(float* a, ov::bfloat16* a_dst, const float val, cons __mmask16 mask = (1 << (size - i)) - 1; v_a = _mm512_maskz_loadu_ps(mask, a + i); v_a = _mm512_mul_ps(v_a, v_scale); - mm512_uni_mask_storeu_ps(a_dst + i, mask, v_a); + mm512_uni_storeu_tail_ps(a_dst + i, v_a, size - i); + + i += (size - i); } #else - for (size_t i = 0; i < size; i++) { + for (; i < size; i++) { a_dst[i] = a[i] * val; } #endif @@ -898,6 +901,7 @@ inline void attn_softmax_kernel(float* a, float alibi_slope) { using func_fp32_type = void (*)(float*, float, const float*, const float*, const uint8_t*, bool, size_t, float, float&); using func_bf16_type = void (*)(float*, float, const float*, const ov::bfloat16*, const uint8_t*, bool, size_t, float, float&); + using func_f16_type = void (*)(float*, float, const float*, const ov::float16*, const uint8_t*, bool, size_t, float, float&); static constexpr func_fp32_type funcs_fp32[] = { scale_add2_reduce_max, scale_add2_reduce_max, @@ -918,12 +922,24 @@ inline void attn_softmax_kernel(float* a, scale_add2_reduce_max, scale_add2_reduce_max }; + static constexpr func_f16_type funcs_f16[] = { + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max + }; int dispatch = (alibi ? 0b100 : 0) | (attn_mask ? 0b010 : 0) | (causal_mask ? 
0b001 : 0); float max = std::numeric_limits::lowest(); if (attn_mask_prec == ov::element::f32) { funcs_fp32[dispatch](a, scale, alibi, static_cast(attn_mask), causal_mask, select_nfltmax_at_0, len, alibi_slope, max); - } else { + } else if (attn_mask_prec == ov::element::bf16) { funcs_bf16[dispatch](a, scale, alibi, static_cast(attn_mask), causal_mask, select_nfltmax_at_0, len, alibi_slope, max); + } else { + funcs_f16[dispatch](a, scale, alibi, static_cast(attn_mask), causal_mask, select_nfltmax_at_0, len, alibi_slope, max); } float sum = 0.0f; @@ -936,11 +952,16 @@ inline void attn_softmax_kernel(float* a, // apply causual mask to final result instead of attn_score if (total_size > len) memset(static_cast(a_dst) + len, 0, sizeof(float) * (total_size - len)); - } else { + } else if (dst_precision == ov::element::bf16) { multiply_scalar(a, static_cast(a_dst), scalar, len); // apply causual mask to final result instead of attn_score if (total_size > len) memset(static_cast(a_dst) + len, 0, sizeof(ov::bfloat16) * (total_size - len)); + } else { + multiply_scalar(a, static_cast(a_dst), scalar, len); + // apply causual mask to final result instead of attn_score + if (total_size > len) + memset(static_cast(a_dst) + len, 0, sizeof(ov::float16) * (total_size - len)); } } #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) @@ -1022,4 +1043,4 @@ inline void attn_softmax_kernel(ov::float16* a, } // namespace XARCH } // namespace Cpu } // namespace Extensions -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.cpp index e729fac66dd257..2895a272b982b5 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.cpp @@ -38,32 +38,54 @@ BrgemmKernel::BrgemmKernel(size_t M, // blocking M M_blk = matmulOptimalM; M_tail = M % M_blk; - brgVnniFactor = 4 / inType.size(); - if (inType != ov::element::bf16 && inType != ov::element::f32) - THROW_ERROR("brgemm kernel only supports bf16, f32"); + if (!one_of(inType, ov::element::bf16, ov::element::f16, ov::element::f32)) + THROW_ERROR("brgemm kernel only supports f16, bf16, f32"); + bool is_f32 = inType == ov::element::f32; + bool is_bf16 = inType == ov::element::bf16; if (is_bf16 && !mayiuse(avx512_core_bf16)) THROW_ERROR("brgemm bf16 kernel could only be used above avx512_bf16"); - bool isAMXSupported = is_bf16 && mayiuse(avx512_core_amx); + bool is_f16 = inType == ov::element::f16; + if (is_f16 && !mayiuse(avx512_core_fp16)) + THROW_ERROR("brgemm f16 kernel could only be used above avx512_f16"); + + srcType = weiType = inType; + // If isa is avx512_core_fp16, f16 is supported by upconverted to f32 + is_avx_f16_only = inType == ov::element::f16 && mayiuse(avx512_core_fp16) && !mayiuse(avx512_core_amx_fp16); + if (is_avx_f16_only) { + srcType = ov::element::f32; + weiType = ov::element::f32; + } + brgVnniFactor = 4 / weiType.size(); + + /* + AVX AMX + fp32 Y N + bf16 Y Y + fp16 Y Y + */ + bool isAMXSupported = (is_bf16 && mayiuse(avx512_core_amx)) || (is_f16 && mayiuse(avx512_core_amx_fp16)); + bool isBrgWithAMX = isAMXSupported && !is_avx_f16_only; + size_t vlen; if (mayiuse(avx512_core)) vlen = cpu_isa_traits::vlen; else vlen = cpu_isa_traits::vlen; // blocking N - N_blk = is_bf16 ? 32 : std::max(N, vlen / inType.size()); + N_blk = !is_f32 ? 
32 : std::max(N, vlen / inType.size()); N_tail = N % N_blk; // blocking K - K_blk = isAMXSupported ? 32 : K; + K_blk = isBrgWithAMX ? 32 : K; K_tail = K % K_blk; - if (isAMXSupported && K_tail) { + if (isBrgWithAMX && K_tail) { K_tail = rnd_up(K_tail, 2); } // copied K must be round up by vlen / inType.size(), otherwise copy B kernel may access wrong memory - packedBSize = rnd_up(K, vlen / inType.size()) * rnd_up(N, N_blk) * inType.size(); + packedBSize = rnd_up(K, vlen / weiType.size()) * rnd_up(N, N_blk) * weiType.size(); size_t brg0BaseIdx = std::numeric_limits::max(); for (size_t m = 0; m < 2; m++) { for (size_t k = 0; k < 2; k++) { @@ -78,18 +100,18 @@ BrgemmKernel::BrgemmKernel(size_t M, brgemmCtx.M = M_; brgemmCtx.N = N_; brgemmCtx.K = K_; - brgemmCtx.LDA = k ? K_blk : lda; - brgemmCtx.LDB = (is_bf16 || b_transposed) ? rnd_up(N, N_blk) : ldb; // bf16/b_transposed needs copy + brgemmCtx.LDA = k ? K_blk : (is_avx_f16_only ? K : lda); // f16 use f32 internally + brgemmCtx.LDB = (!is_f32 || b_transposed) ? rnd_up(N, N_blk) : ldb; // bf16/fp16/b_transposed needs copy brgemmCtx.LDC = ldc; - brgemmCtx.dt_in0 = static_cast(DnnlExtensionUtils::ElementTypeToDataType(inType)); - brgemmCtx.dt_in1 = static_cast(DnnlExtensionUtils::ElementTypeToDataType(inType)); + brgemmCtx.dt_in0 = static_cast(DnnlExtensionUtils::ElementTypeToDataType(srcType)); + brgemmCtx.dt_in1 = static_cast(DnnlExtensionUtils::ElementTypeToDataType(weiType)); brgemmCtx.beta = beta; // don't create brgemm kernels for empty tiles if (M_ != 0 && K_ != 0 && N_ != 0) { if (brg0BaseIdx == std::numeric_limits::max()) brg0BaseIdx = getBrgIdx(m, k, n); - init_brgemm(brgemmCtx, brgKernels[getBrgIdx(m, k, n)], isAMXSupported); + init_brgemm(brgemmCtx, brgKernels[getBrgIdx(m, k, n)], isBrgWithAMX); } } } @@ -97,12 +119,19 @@ BrgemmKernel::BrgemmKernel(size_t M, auto& brgemmCtx0 = brgCtxs[brg0BaseIdx]; - if (brgemmCtx0.is_with_amx && K_tail) { - init_brgemm_copy_a(brgCopyAKernel, K, K_blk, K_tail, K_blk, brgemmCtx0.dt_in0, false, lda * inType.size()); - packedASize = M_blk * rnd_up(K, K_blk) * inType.size(); + if ((brgemmCtx0.is_with_amx && K_tail) || is_avx_f16_only) { + init_brgemm_copy_a(brgCopyAKernel, + K, + K_blk, + K_tail, + is_avx_f16_only ? K : K_blk, + brgemmCtx0.dt_in0, + false, + lda * inType.size()); + packedASize = M_blk * rnd_up(K, brgemmCtx0.LDA) * srcType.size(); } - if (brgemmCtx0.is_with_amx || inType == ov::element::bf16 || b_transposed) { + if (brgemmCtx0.is_with_amx || !is_f32 || b_transposed) { size_t b_stride = 0; b_stride = ldb * inType.size(); // K should use the original K @@ -136,10 +165,20 @@ void BrgemmKernel::init_brgemm(brgemmCtx& ctx, const bool is_int8 = one_of(ctx.dt_in0, data_type::u8, data_type::s8) && one_of(ctx.dt_in1, data_type::u8, data_type::s8); cpu_isa_t isa; - if (mayiuse(avx512_core)) { - isa = use_amx ? isa_undef - : ctx.dt_in0 == dnnl_data_type_t::dnnl_bf16 ? avx512_core_bf16 - : (is_int8 ? 
avx512_core_vnni : avx512_core); + if (use_amx) { + isa = isa_undef; + } else if (mayiuse(avx512_core)) { + if (ctx.dt_in0 == dnnl_data_type_t::dnnl_bf16 && mayiuse(avx512_core_bf16)) { + isa = avx512_core_bf16; + } else if (ctx.dt_in0 == dnnl_data_type_t::dnnl_f16 && mayiuse(avx512_core_fp16)) { + isa = avx512_core_fp16; + } else { + if (is_int8) { + isa = avx512_core_vnni; + } else { + isa = avx512_core; + } + } } else { isa = cpu_isa_t::avx2; } @@ -161,7 +200,7 @@ void BrgemmKernel::init_brgemm(brgemmCtx& ctx, ctx.K, nullptr); if (status != dnnl_success) { - THROW_ERROR("cannot be executed due to invalid brgconv params"); + THROW_ERROR("cannot be executed due to invalid brgemm params"); } if (use_amx && b_accumulate) { @@ -193,6 +232,7 @@ void BrgemmKernel::init_brgemm(brgemmCtx& ctx, } brgKernel.reset(brgKernel_); } + void BrgemmKernel::init_brgemm_copy_a( std::unique_ptr& brgCopyKernel, size_t K, @@ -214,13 +254,15 @@ void BrgemmKernel::init_brgemm_copy_a( brgCopyKernelConf.s8s8_compensation_required = false; brgCopyKernelConf.wei_zp_type = dnnl::impl::cpu::x64::none; brgCopyKernelConf.src_zp_type = dnnl::impl::cpu::x64::none; - brgCopyKernelConf.src_dt = dt_in0; + brgCopyKernelConf.src_dt = is_avx_f16_only ? dnnl_data_type_t::dnnl_f32 : dt_in0; brgCopyKernelConf.copy_A_src_stride = copy_A_src_stride; - brgCopyKernelConf.a_dt_sz = DnnlExtensionUtils::sizeOfDataType(static_cast(dt_in0)); + // copy_a_kernel assumes that in/out tensor has same data type except f16 + // copy_a_kernel has special path for f16: assuming input(f16) -> output(f32) + brgCopyKernelConf.a_dt_sz = is_avx_f16_only ? sizeof(ov::float16) : DnnlExtensionUtils::sizeOfDataType(static_cast(dt_in0)); // copied A has the same precision of original - brgCopyKernelConf.tr_a_dt_sz = DnnlExtensionUtils::sizeOfDataType(static_cast(dt_in0)); + brgCopyKernelConf.tr_a_dt_sz = is_avx_f16_only ? sizeof(float) : DnnlExtensionUtils::sizeOfDataType(static_cast(dt_in0)); brgCopyKernelConf.transposed_A = transpose; - brgCopyKernelConf.isa = avx512_core_amx; + brgCopyKernelConf.isa = is_avx_f16_only ? avx512_core_fp16 : avx512_core_amx; create_brgemm_matmul_copy_a(brgCopyKernel, &brgCopyKernelConf); } @@ -238,8 +280,8 @@ void BrgemmKernel::init_brgemm_copy_b( bool transpose, size_t copy_B_wei_stride) { brgemm_matmul_conf_t brgCopyKernelConf; - brgCopyKernelConf.src_dt = dt_in0; - brgCopyKernelConf.wei_dt = dt_in1; + brgCopyKernelConf.src_dt = is_avx_f16_only ? dnnl_data_type_t::dnnl_f32 : dt_in0; + brgCopyKernelConf.wei_dt = is_avx_f16_only ? dnnl_data_type_t::dnnl_f32 : dt_in1; brgCopyKernelConf.orig_wei_dt = dt_in1; brgCopyKernelConf.wei_n_blk = N_blk; brgCopyKernelConf.wei_tag = transpose ? dnnl_ba : dnnl_ab; @@ -255,17 +297,23 @@ void BrgemmKernel::init_brgemm_copy_b( brgCopyKernelConf.K_blk = K; brgCopyKernelConf.K_tail = 0; brgCopyKernelConf.N_chunk_elems = brgCopyKernelConf.N_blk; - brgCopyKernelConf.b_dt_sz = + // f16 is computed by upconverting. in(f16) -> out(f32) + brgCopyKernelConf.b_dt_sz = is_avx_f16_only ? sizeof(ov::float16) : DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.src_dt)); - brgCopyKernelConf.tr_b_dt_sz = + brgCopyKernelConf.tr_b_dt_sz = is_avx_f16_only ? sizeof(float) : DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.src_dt)); brgCopyKernelConf.req_wei_vnni_downconvert = false; if (is_with_amx) { - brgCopyKernelConf.isa = avx512_core_amx; + brgCopyKernelConf.isa = dt_in0 == dnnl_data_type_t::dnnl_f16 ? 
avx512_core_amx_fp16 : avx512_core_amx; brgCopyKernelConf.s8s8_compensation_required = false; } else { - brgCopyKernelConf.isa = dt_in0 == dnnl_data_type_t::dnnl_bf16 ? avx512_core_bf16 : avx512_core_vnni; + if (inType == ov::element::f16) { + brgCopyKernelConf.isa = mayiuse(avx512_core_fp16) ? avx512_core_fp16 : avx2_vnni_2; + } else { + brgCopyKernelConf.isa = dt_in0 == dnnl_data_type_t::dnnl_bf16 ? avx512_core_bf16 : avx512_core_vnni; + } + brgCopyKernelConf.s8s8_compensation_required = false; } brgCopyKernelConf.has_zero_point_a = false; @@ -283,7 +331,7 @@ void BrgemmKernel::copy_buffer_b(void* b, void* scratch_b) { for (size_t nb = 0; nb < div_up(N, N_blk); nb++) { auto N_stride = b_transposed ? ldb : 1; auto pCopyKernel0In = ptr_b + nb * N_blk * inType.size() * N_stride; - auto pCopyKernel0Out = ptr_scartch_b + nb * N_blk * brgVnniFactor * inType.size(); + auto pCopyKernel0Out = ptr_scartch_b + nb * N_blk * brgVnniFactor * weiType.size(); auto ctx = jit_brgemm_matmul_copy_b_t::ctx_t(); @@ -306,15 +354,13 @@ void BrgemmKernel::executeGemm(bool is_M_tail, void* a, void* b, void* c, void* auto ptr_C = reinterpret_cast(c); auto ptr_scartch_a = reinterpret_cast(scratch_a); auto ptr_scartch_b = reinterpret_cast(b); - uint8_t* ptr_a_tail = nullptr; size_t brgIdx0 = getBrgIdx(0, 0, 0); // The step for matrix A over main K dimension size_t K0_step0 = brgCtxs[brgIdx0].K; auto cur_M_blk = is_M_tail ? M_tail : M_blk; if (brgCopyAKernel) { - // only copy tailed data; - size_t K_offset = K < K_blk ? 0 : K0_step0 * inType.size(); + size_t K_offset = is_avx_f16_only ? 0 : (K < K_blk ? 0 : K0_step0 * srcType.size()); auto pCopyKernelIn = ptr_A + K_offset; auto pCopyKernelOut = ptr_scartch_a; @@ -331,8 +377,6 @@ void BrgemmKernel::executeGemm(bool is_M_tail, void* a, void* b, void* c, void* ctx.current_K_blk = K % K_blk; (*brgCopyAKernel)(&ctx); - - ptr_a_tail = pCopyKernelOut; } size_t count_N = 0; for (size_t n = 0; n < 2; n++) { @@ -341,17 +385,17 @@ void BrgemmKernel::executeGemm(bool is_M_tail, void* a, void* b, void* c, void* size_t mIdx = is_M_tail ? 1 : 0; auto& brgemmCtx = brgCtxs[getBrgIdx(mIdx, k, n)]; if (brgemmCtx.K != 0 && brgemmCtx.N != 0 && brgemmCtx.M != 0) { - auto local_a_ptr = k > 0 ? ptr_a_tail : ptr_A; - auto B_stride = (k * count_K + n * count_N * brgVnniFactor) * inType.size(); + auto local_a_ptr = is_avx_f16_only ? ptr_scartch_a : (k > 0 ? ptr_scartch_a : ptr_A); + auto B_stride = (k * count_K + n * count_N * brgVnniFactor) * weiType.size(); auto weight_ptr = ptr_scartch_b + B_stride; auto C_stride = n * count_N * ov::element::f32.size(); auto out_ptr = ptr_C + C_stride; callBrgemm(brgemmCtx, - brgKernels[getBrgIdx(mIdx, k, n)], - local_a_ptr, - weight_ptr, - out_ptr, - wsp); + brgKernels[getBrgIdx(mIdx, k, n)], + local_a_ptr, + weight_ptr, + out_ptr, + wsp); // stride K, N if body kernel is executed. 
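Aside: on AVX-512 machines that lack AMX-FP16, the is_avx_f16_only path above keeps the external tensors in f16 but lets the copy-A/copy-B kernels widen them to f32, so the brgemm itself computes in f32; that is also why a_dt_sz/b_dt_sz stay sizeof(ov::float16) while tr_a_dt_sz/tr_b_dt_sz become sizeof(float). A hedged sketch of the widening step (the real work is done by oneDNN's copy kernels, not a scalar loop like this):

#include <cstddef>
#include <vector>
#include "openvino/core/type/float16.hpp"

// Upconvert an f16 panel to f32 before handing it to an f32 GEMM.
std::vector<float> widen_to_f32(const ov::float16* src, size_t count) {
    std::vector<float> dst(count);
    for (size_t i = 0; i < count; ++i)
        dst[i] = static_cast<float>(src[i]);  // f16 -> f32
    return dst;
}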
if (k == 0) { count_K = brgemmCtx.K * brgemmCtx.LDB; @@ -373,17 +417,17 @@ void BrgemmKernel::executeGemm(void* a, void* b, void* c, void* wsp, void* scrat for (size_t mb = 0; mb < div_up(M, M_blk); mb++) { const bool is_M_tail = (M - mb * M_blk < M_blk); - auto ptr_a = ptr_A + (mb * M_blk * lda) * inType.size(); + auto ptr_a = ptr_A + (mb * M_blk * lda) * srcType.size(); auto ptr_c = ptr_C + (mb * M_blk * ldc) * ov::element::f32.size(); executeGemm(is_M_tail, ptr_a, scratch_b, wsp, ptr_c, scratch_a); } } void BrgemmKernel::callBrgemm(brgemmCtx& ctx, - std::unique_ptr& brgKernel, - const void* pin0, - const void* pin1, - void* pout, - void* wsp) { + std::unique_ptr& brgKernel, + const void* pin0, + const void* pin1, + void* pout, + void* wsp) { if (ctx.is_with_amx) amx_tile_configure(ctx.palette); if (ctx.is_with_comp) { @@ -398,4 +442,4 @@ void BrgemmKernel::callBrgemm(brgemmCtx& ctx, } } // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.hpp index 513b484ab0b963..38384f2aceae83 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.hpp @@ -59,6 +59,9 @@ class BrgemmKernel { size_t packedBSize = 0; size_t packedASize = 0; ov::element::Type inType; + ov::element::Type weiType; + ov::element::Type srcType; + bool is_avx_f16_only = false; bool b_accumulate = false; static constexpr size_t MHA_BRGEMM_KERNELS_NUM = 8; static constexpr size_t matmulOptimalM = 32; diff --git a/src/plugins/intel_cpu/src/nodes/matmul.cpp b/src/plugins/intel_cpu/src/nodes/matmul.cpp index 50cb3353612996..92d8f356728ed9 100644 --- a/src/plugins/intel_cpu/src/nodes/matmul.cpp +++ b/src/plugins/intel_cpu/src/nodes/matmul.cpp @@ -543,6 +543,20 @@ void MatMul::prepareParams() { if (!src0MemPtr || !src0MemPtr->isDefined() || !src1MemPtr || !src1MemPtr->isDefined()) OPENVINO_THROW(errorPrefix, " has undefined input memory"); + // check for a degenerate case. In this context the degenerate case is a matrix multiplication where the + // collapsing dimension is zero, e.g., AB=C, where A has the shape [10, 0] and B has the shape [0, 20], + // consequently C has shape [10, 20]. In this scenario C is a null matrix (a matrix filled with zeroes) + // according to the empty sum convention. 
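Aside: the "empty sum convention" mentioned above is just the definition of matrix multiplication: each output element is a sum over the collapsing dimension K, and a sum with zero terms is zero, so a [10, 0] x [0, 20] product is a [10, 20] matrix of zeroes. A tiny reference implementation makes that concrete (illustrative only, not the plugin's executor):

#include <cstddef>
#include <vector>

// C[MxN] = A[MxK] * B[KxN]; when K == 0 the inner loop never runs and C stays zero.
std::vector<float> matmul_ref(const float* A, const float* B, size_t M, size_t K, size_t N) {
    std::vector<float> C(M * N, 0.0f);  // value-initialized, i.e. the empty sum
    for (size_t i = 0; i < M; ++i)
        for (size_t j = 0; j < N; ++j)
            for (size_t k = 0; k < K; ++k)
                C[i * N + j] += A[i * K + k] * B[k * N + j];
    return C;
}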
+ if (src0MemPtr->getDesc().getShape().hasZeroDims() && src0MemPtr->getDesc().getShape().hasZeroDims() && + !dstMemPtr->getDesc().getShape().hasZeroDims()) { + // todo: obviously we need a special executor that would process fused ops providing a correct result + OPENVINO_ASSERT(!withBiases && fusedWith.empty(), + "Matmul doesn't support a degenerate case when other ops are fused"); + //reset executor + execPtr.reset(); + return; + } + const NodeDesc* selected_pd = getSelectedPrimitiveDescriptor(); if (selected_pd == nullptr) OPENVINO_THROW(errorPrefix, " did not set preferable primitive descriptor"); @@ -646,6 +660,9 @@ void MatMul::prepareParams() { void MatMul::execute(dnnl::stream strm) { if (execPtr) { execPtr->exec(primArgs, strm); + } else if (hasEmptyInputTensors()) { + // this is a degenerate case, fill output with zeroes + getDstMemoryAtPort(0)->nullify(); } else { OPENVINO_THROW(errorPrefix, " doesn't have an initialized executor"); } @@ -691,6 +708,10 @@ const std::vector& MatMul::getDefaultImplPriority() { return priorities; } +bool MatMul::isExecutable() const { + return !hasEmptyOutputTensors(); +} + } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/matmul.h b/src/plugins/intel_cpu/src/nodes/matmul.h index 7b8f064e17260b..2e487148d0ec0c 100644 --- a/src/plugins/intel_cpu/src/nodes/matmul.h +++ b/src/plugins/intel_cpu/src/nodes/matmul.h @@ -43,6 +43,8 @@ class MatMul : public Node { const std::vector& getDefaultImplPriority() override; bool canBeExecutedInInt8() const override; + bool isExecutable() const override; + protected: AttrPtr initPrimitiveAttr() override; AttrPtr initPrimitiveAttr(const VectorDims& dims); diff --git a/src/plugins/intel_cpu/src/nodes/memory.cpp b/src/plugins/intel_cpu/src/nodes/memory.cpp index e66b148c6f99ee..88693ebfa49fdf 100644 --- a/src/plugins/intel_cpu/src/nodes/memory.cpp +++ b/src/plugins/intel_cpu/src/nodes/memory.cpp @@ -300,21 +300,27 @@ void MemoryOutput::runStatic(dnnl::stream strm) { void MemoryOutput::runDynamic(dnnl::stream strm) { //first we have to resize the output memory auto inputMem = getSrcMemoryAtPort(0); - const auto& newDims = inputMem->getStaticDims(); - OPENVINO_ASSERT(extMemDesc, - "MemoryOutput ", - getName(), - " uninitialized assigned memory"); - - auto newExternDesc = extMemDesc->cloneWithNewDims(newDims); OPENVINO_ASSERT(assignedMem, "MemoryOutput ", getName(), " uninitialized assigned memory"); - assignedMem->redefineDesc(newExternDesc); - runStatic(strm); + const auto& newShape = inputMem->getShape(); + const auto& stateShape = assignedMem->getShape(); + + if (stateShape.isDynamic() || stateShape.getStaticDims() != newShape.getStaticDims()) { + OPENVINO_ASSERT(extMemDesc, + "MemoryOutput ", + getName(), + " uninitialized assigned memory"); + auto newExternDesc = extMemDesc->cloneWithNewDims(newShape.getStaticDims()); + assignedMem->redefineDesc(newExternDesc); + } + + if (!newShape.hasZeroDims()) { // no need to copy data for empty tensor + runStatic(strm); + } } bool MemoryOutputStub::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { @@ -413,7 +419,7 @@ MemoryInputBase::~MemoryInputBase() { } MemoryOutputBase& MemoryInputBase::getOutputNode() { - OPENVINO_ASSERT(outputNode, "MemoryOutput ", getName(), " doesn't have sibling input"); + OPENVINO_ASSERT(outputNode, "MemoryInput ", getName(), " doesn't have sibling output"); return *outputNode; } @@ -593,31 +599,44 @@ void MemoryInput::runDynamic(dnnl::stream strm) { 
getName(), " assigned state has null memory ptr"); - // check whether we can share memory block - const auto& stateDims = assignedMem->getStaticDims(); - const bool hasZeroDims = std::count(std::begin(stateDims), std::end(stateDims), 0) > 0; - auto internDesc = getBaseMemDescAtOutputPort(0)->cloneWithNewDims(stateDims, hasZeroDims); - OPENVINO_ASSERT(memBlock, "MemoryInput ", getName(), " has uninitialized memory block."); + // check whether we can share memory block + const auto& shape = assignedMem->getShape(); + const bool hasZeroDims = shape.hasZeroDims(); + const bool processInitGraph = needInitGraphProcessing(); + const auto& stateDims = shape.getStaticDims(); + + if (hasZeroDims && !processInitGraph) { + // fast track as we don't really need to share memory and transfer any data for empty tensors + memBlock->reset(); + redefineOutputMemory(0, stateDims); + return; + } + + auto dst = getDstMemoryAtPort(0); + auto currentOutputDesc = dst->getDescPtr(); + + auto internDesc = currentOutputDesc->isDefined() && (currentOutputDesc->getShape().getStaticDims() == stateDims) + ? currentOutputDesc + : getBaseMemDescAtOutputPort(0)->cloneWithNewDims(stateDims, hasZeroDims); + if (internDesc->isCompatible(assignedMem->getDesc())) { memBlock->setMemBlock(assignedMem->getMemoryBlock()); } else { memBlock->reset(); } - const bool processInitGraph = needInitGraphProcessing(); //reshape output const auto& newDims = processInitGraph ? getSrcMemoryAtPort(0)->getStaticDims() : stateDims; - redefineOutputMemory({newDims}); + redefineOutputMemory(0, newDims); //copy data when necessary auto src = processInitGraph ? getSrcMemoryAtPort(0) : assignedMem; - auto dst = getDstMemoryAtPort(0); if (src->getData() != dst->getData()) { dst->load(*src); } @@ -847,6 +866,6 @@ void MemoryInputSDPA::resolveInPlaceEdges(Edge::LOOK look) { } } -} // namespace node +} // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/paged_attn.cpp b/src/plugins/intel_cpu/src/nodes/paged_attn.cpp index 2272fa481d5471..6bf7d3099a85d9 100644 --- a/src/plugins/intel_cpu/src/nodes/paged_attn.cpp +++ b/src/plugins/intel_cpu/src/nodes/paged_attn.cpp @@ -190,6 +190,8 @@ ov::element::Type PagedAttention::getRuntimePrecision() const { // bf16 should be enabled only when platform supports if (rtPrecision == ov::element::bf16 && ov::with_cpu_x86_bfloat16()) { rtPrecision = ov::element::bf16; + } else if (rtPrecision == ov::element::f16 && ov::with_cpu_x86_avx512_core_fp16()) { + rtPrecision = ov::element::f16; } else { rtPrecision = ov::element::f32; } diff --git a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp index 016fa90398aa4b..eecba2acff260b 100644 --- a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp +++ b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp @@ -261,7 +261,7 @@ struct MHAKernel { void prepare_brgemm_prim(dnnl::stream strm, PlainTensor& query, PlainTensor& present_key, bool has_out_transpose) { auto in_type = precision_of::value; - auto qkv_dt = in_type == ov::element::f32 ? 
dt::f32 : dt::bf16; + auto qkv_dt = DnnlExtensionUtils::ElementTypeToDataType(in_type); auto B = query.size(0); auto H = query.size(1); auto q_len = query.size(2); @@ -354,13 +354,13 @@ struct MHAKernel { size_t h_each_group_len = H / Hk; const size_t m_block_size = qk_gemm_ptr->get_mblk_size(); auto m_blocks = (q_len + m_block_size - 1) / m_block_size; - bool is_bf16 = precision_of::value == ov::element::bf16; + bool is_xf16 = precision_of::value == ov::element::bf16 || precision_of::value == ov::element::f16; // packed k, v parallel_for2d(B, Hk, [&](size_t b, size_t h) { T* k_ptr = &present_key.at({b, h, 0, 0}); T* v_ptr = &present_value.at({b, h, 0, 0}); qk_gemm_ptr->copy_buffer_b(k_ptr, &qk_scratch_b.at({b, h, 0})); - if (is_bf16) + if (is_xf16) wv_gemm_ptr->copy_buffer_b(v_ptr, &wv_scratch_b.at({b, h, 0})); }); @@ -420,12 +420,12 @@ struct MHAKernel { } auto* w_ptr = reinterpret_cast(weight_score.ptr(ithr, h, 0, 0)); float* fp32_out_ptr; - if (is_bf16) { + if (is_xf16) { fp32_out_ptr = has_out_transpose ? &fp32_out.at({b, m_start, h, 0}) : &fp32_out.at({b, h, m_start, 0}); } else { fp32_out_ptr = has_out_transpose ? &output_emb.at({b, m_start, h * head_size}) : &output_emb.at({b, h, m_start, 0}); } - T* v_ptr = is_bf16 ? &wv_scratch_b.at({b, h / h_each_group_len, 0}) + T* v_ptr = is_xf16 ? &wv_scratch_b.at({b, h / h_each_group_len, 0}) : &present_value.at({b, h / h_each_group_len, 0, 0}); wv_gemm_ptr->executeGemm(m_cnt < m_block_size, w_ptr, @@ -433,12 +433,12 @@ struct MHAKernel { fp32_out_ptr, wsp.data() + tid * wsp_size_per_thread, wv_scratch_a ? &wv_scratch_a.at({tid, 0}) : nullptr); - if (is_bf16) { + if (is_xf16) { if (has_out_transpose) { attn_memcpy2d_kernel(&fp32_out.at({b, m_start, h, 0}), &output_emb.at({b, m_start, h * head_size}), ov::element::f32, - ov::element::bf16, + precision_of::value, fp32_out.stride(1), output_emb.stride(1), head_size, @@ -447,7 +447,7 @@ struct MHAKernel { attn_memcpy2d_kernel(&fp32_out.at({b, h, m_start, 0}), &output_emb.at({b, h, m_start, 0}), ov::element::f32, - ov::element::bf16, + precision_of::value, 0, 0, m_cnt * head_size, @@ -1068,28 +1068,35 @@ void ScaledDotProductAttention::createPrimitive() { auto builder = [&](const ScaledDotProductAttentionKey& key) -> std::shared_ptr { std::shared_ptr executor = nullptr; - if (rtPrecision == ov::element::bf16) { #ifdef OPENVINO_ARCH_X86_64 + if (rtPrecision == ov::element::bf16) { executor = std::make_shared>(context); -#endif + } else if (rtPrecision == ov::element::f16) { + if (with_cpu_x86_avx512_core_fp16()) { + executor = std::make_shared>(context); + } else { + executor = std::make_shared>(context); + } } else { -#if defined(OV_CPU_WITH_ACL) - if (rtPrecision == ov::element::f16) - executor = std::make_shared>(context); - else - executor = std::make_shared>(context); -#elif defined(OV_CPU_WITH_MLAS) +#ifdef OV_CPU_WITH_MLAS executor = std::make_shared>(context); -#elif defined(OPENVINO_ARCH_X86_64) +#else if (with_cpu_x86_avx512_core()) { executor = std::make_shared>(context); } else { executor = std::make_shared>(context); } -#else - executor = std::make_shared>(context); #endif } +#elif defined(OV_CPU_WITH_ACL) + if (rtPrecision == ov::element::f16) { + executor = std::make_shared>(context); + } else { + executor = std::make_shared>(context); + } +#else + executor = std::make_shared>(context); +#endif return executor; }; diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index fa1810ff6044f9..5c88772eeedabc 100644 --- 
a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -446,7 +446,7 @@ ov::Any Plugin::get_ro_property(const std::string& name, const ov::AnyMap& optio } else if (ov::internal::supported_properties == name) { return decltype(ov::internal::supported_properties)::value_type{ ov::PropertyName{ov::internal::caching_properties.name(), ov::PropertyMutability::RO}, -#if !defined(OPENVINO_ARCH_ARM) +#if !defined(OPENVINO_ARCH_ARM) && !(defined(__APPLE__) || defined(__MACOSX)) ov::PropertyName{ov::internal::caching_with_mmap.name(), ov::PropertyMutability::RO}, #endif ov::PropertyName{ov::internal::exclusive_async_requests.name(), ov::PropertyMutability::RW}, diff --git a/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp b/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp index e5d87c578712f6..5db6f97bba8c02 100644 --- a/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp +++ b/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp @@ -101,6 +101,7 @@ #include "scaled_dot_product_attention_shape_inference.hpp" #include "scatter_elements_update_shape_inference.hpp" #include "scatter_nd_base_shape_inference.hpp" +#include "search_sorted_shape_inference.hpp" #include "select_shape_inference.hpp" #include "shape_nodes.hpp" #include "shuffle_channels_shape_inference.hpp" @@ -405,6 +406,7 @@ using IStaticShapeInferFactory = template <> const IStaticShapeInferFactory::TRegistry IStaticShapeInferFactory::registry{ // opset15 + _OV_OP_SHAPE_INFER_MASK_REG(op::v15::SearchSorted, ShapeInferTA, util::bit::mask()), _OV_OP_SHAPE_INFER_MASK_REG(op::v15::StringTensorUnpack, ShapeInferTA, util::bit::mask(0)), _OV_OP_SHAPE_INFER_MASK_REG(op::v15::StringTensorPack, ShapeInferTA, util::bit::mask(0, 1)), _OV_OP_SHAPE_INFER_MASK_REG(opset15::EmbeddingBagOffsets, ShapeInferTA, util::bit::mask()), diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index abf1ad8f283205..0e683482a97934 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -238,6 +238,17 @@ bool Transformations::fuse_type_to_fq(const std::shared_ptr& node, con return true; } +bool Transformations::fuse_type_to_pa(const std::shared_ptr& node, const precisions_map& precisions) { + auto pa = ov::as_type_ptr(node); + if (!pa) + return false; + // PagedAttentionExtension's 2nd output type should be kept f32. + // The reason is that the pagedattention node in CPU plugin hardcodes 2nd output type as f32. + // So, set f32 to the 2nd output type, which can avoid extra data type conversion during transformation. 
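Aside: for readability, here is the general shape of such a type-fusing callback with the template arguments restored (they are stripped in this rendering of the diff); the exact header paths are an assumption:

#include <memory>
#include "openvino/core/node.hpp"                 // assumed header locations
#include "openvino/op/paged_attention.hpp"
#include "transformations/convert_precision.hpp"  // precisions_map, type_to_fuse_map

bool fuse_second_output_to_f32(const std::shared_ptr<ov::Node>& node,
                               const precisions_map& /*precisions*/) {
    auto pa = ov::as_type_ptr<ov::op::PagedAttentionExtension>(node);
    if (!pa)
        return false;
    pa->set_out_type(1, ov::element::f32);  // keep the score output in f32
    return true;
}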
+ pa->set_out_type(1, ov::element::f32); + return true; +} + bool Transformations::fuse_type_to_convert(const std::shared_ptr& node, const precisions_map& precisions) { auto convert = ov::as_type_ptr(node); if (!convert) @@ -391,7 +402,7 @@ void Transformations::PreLpt(const std::vector& defaultPrecis #if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) type_to_fuse_map fuse_map = {{ov::opset1::FakeQuantize::get_type_info_static(), fuse_type_to_fq}}; #else - type_to_fuse_map fuse_map = {}; + type_to_fuse_map fuse_map = {{ov::op::PagedAttentionExtension::get_type_info_static(), fuse_type_to_pa}}; #endif const bool keep_precision_sensitive_in_fp32 = true; CPU_REGISTER_PASS_COMMON(manager, diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.h b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.h index 0b6a437f667747..33c26ab8aea9e4 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.h +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.h @@ -48,6 +48,7 @@ class Transformations { static bool fuse_type_to_convert(const std::shared_ptr& node, const precisions_map& precisions); static bool fuse_type_to_fq(const std::shared_ptr& node, const precisions_map& precisions); + static bool fuse_type_to_pa(const std::shared_ptr& node, const precisions_map& precisions); }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/utils/serialize.cpp b/src/plugins/intel_cpu/src/utils/serialize.cpp index 6666d42fb4f586..f7fd337afa932e 100644 --- a/src/plugins/intel_cpu/src/utils/serialize.cpp +++ b/src/plugins/intel_cpu/src/utils/serialize.cpp @@ -58,19 +58,20 @@ void ModelDeserializer::set_info(pugi::xml_node& root, std::shared_ptr>(std::shared_ptr& model) { - if (auto mmap_stream = dynamic_cast(&m_istream)) { - process_mmap(model, mmap_stream->m_memory); + if (auto mmap_buffer = dynamic_cast(m_istream.rdbuf())) { + auto buffer = mmap_buffer->get_buffer(); + process_mmap(model, buffer); } else { process_stream(model); } } void ModelDeserializer::process_mmap(std::shared_ptr& model, - const std::shared_ptr& mmemory) { + const std::shared_ptr& mmemory) { // Note: Don't use seekg with mmaped stream. This may affect the performance of some models. // Get file size before seek content. // Blob from cache may have other header, so need to skip this. 
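Aside: with a memory-mapped blob, the header is parsed by pointer arithmetic at a known offset instead of stream seeks. A sketch of that idea; the struct layout and field names below are illustrative, not the plugin's real header format:

#include <cstddef>
#include <cstdint>
#include <cstring>

struct BlobHeader {            // hypothetical layout
    uint64_t consts_offset;
    uint64_t consts_size;
    uint64_t model_offset;
    uint64_t model_size;
};

// hdr_pos is where the cache blob's own header ends (e.g. what istream.tellg() returned).
inline BlobHeader read_header(const uint8_t* buffer_base, size_t hdr_pos) {
    BlobHeader hdr{};
    std::memcpy(&hdr, buffer_base + hdr_pos, sizeof(hdr));  // no seekg on the mmapped data
    return hdr;
}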
- auto buffer_base = mmemory->data(); + auto buffer_base = reinterpret_cast(mmemory->get_ptr()); const auto file_size = mmemory->size(); const size_t hdr_pos = m_istream.tellg(); @@ -98,9 +99,7 @@ void ModelDeserializer::process_mmap(std::shared_ptr& model, // Map blob content std::shared_ptr weights_buf; if (hdr.consts_size) { - weights_buf = std::make_shared>>(buffer_base + hdr.consts_offset, - hdr.consts_size, - mmemory); + weights_buf = std::make_shared>>(buffer_base + hdr.consts_offset, hdr.consts_size, mmemory); } // XML content diff --git a/src/plugins/intel_cpu/src/utils/serialize.hpp b/src/plugins/intel_cpu/src/utils/serialize.hpp index 817041452c9597..897a2c2e52f092 100644 --- a/src/plugins/intel_cpu/src/utils/serialize.hpp +++ b/src/plugins/intel_cpu/src/utils/serialize.hpp @@ -40,7 +40,7 @@ class ModelDeserializer { protected: static void set_info(pugi::xml_node& root, std::shared_ptr& model); - void process_mmap(std::shared_ptr& model, const std::shared_ptr& memory); + void process_mmap(std::shared_ptr& model, const std::shared_ptr& memory); void process_stream(std::shared_ptr& model); diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/activation.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/activation.cpp index 0f25351a020f60..307938fbfec17a 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/activation.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/activation.cpp @@ -193,6 +193,7 @@ std::string ActivationLayerCPUTest::getPrimitiveType(const utils::ActivationType (activation_type == utils::ActivationTypes::Relu) || (activation_type == utils::ActivationTypes::Sigmoid) || (activation_type == utils::ActivationTypes::SoftSign) || + (activation_type == utils::ActivationTypes::Sqrt) || (activation_type == utils::ActivationTypes::Swish) || (activation_type == utils::ActivationTypes::LogicalNot) || (activation_type == utils::ActivationTypes::Tanh))) { diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/matmul.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/matmul.cpp index 934a0f4bc95f18..9b5d7287875d7c 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/matmul.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/matmul.cpp @@ -35,8 +35,8 @@ const std::vector IS = { const std::vector IS_Dynamic = { { { //dynamic case description each pair per each input has {{dynamic shape}, {{static shape case1}, {static shape case2}, ...} - {{-1, -1}, {{55, 12}, {33, 7}}}, // input 0 - {{-1, -1}, {{12, 55}, {7, 33}}} // input 1 + {{-1, -1}, {{55, 12}, {33, 7}, {33, 0}, {0, 33}}}, // input 0 + {{-1, -1}, {{12, 55}, {7, 33}, {0, 33}, {33, 0}}} // input 1 }, {false, false} }, diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/concat_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/concat_sdp.cpp index eb6fdc2a6bfc3f..8a9212f8998f94 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/concat_sdp.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/concat_sdp.cpp @@ -37,6 +37,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConcatSDPTest, ConcatSDPTest, ::testing::Combine(::testing::Values(ElementType::f16), ::testing::ValuesIn(inputShapes), + ::testing::Values(false), ::testing::Values(true, false)), 
ConcatSDPTest::getTestCaseName); } // namespace diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.cpp index f4abaa03b7c28b..f5a7bfacfac99f 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.cpp @@ -28,8 +28,9 @@ namespace test { std::string ConcatSDPTest::getTestCaseName(const testing::TestParamInfo& obj) { ElementType inType; std::vector inputShapes; - bool hasShapeof; - std::tie(inType, inputShapes, hasShapeof) = obj.param; + bool forceKVU8; + bool hasShapeOf; + std::tie(inType, inputShapes, forceKVU8, hasShapeOf) = obj.param; std::ostringstream result; result << "IS="; for (const auto& shape : inputShapes) { @@ -46,21 +47,24 @@ std::string ConcatSDPTest::getTestCaseName(const testing::TestParamInfo(gatherK); shapeof_v = std::make_shared(gatherV); } @@ -107,20 +111,20 @@ void ConcatSDPTest::SetUp() { pastv_assign->set_friendly_name("pastv_w"); ResultVector results{std::make_shared(add)}; - if (hasShapeOf) { + if (m_hasShapeOf) { results.push_back(std::make_shared(shapeof_k)); results.push_back(std::make_shared(shapeof_v)); } SinkVector sinks{pastk_assign, pastv_assign}; function = std::make_shared(results, sinks, inputParams, "ConcatSDP"); targetDevice = ov::test::utils::DEVICE_CPU; - functionRefs = function->clone(); pass::Manager manager; // decompose ScaledDotProductAttention manager.register_pass(); manager.run_passes(functionRefs); } + void ConcatSDPTest::generate_inputs(const std::vector& targetInputStaticShapes) { std::vector shapes(4); shapes[0] = targetInputStaticShapes[0]; @@ -129,6 +133,7 @@ void ConcatSDPTest::generate_inputs(const std::vector& targetInputSta shapes[3] = targetInputStaticShapes[1]; SubgraphBaseTest::generate_inputs(shapes); } + template void strided_iota(IT first, size_t n, T value, T stride) { for (size_t i = 0; i < n; i++) { @@ -136,6 +141,7 @@ void strided_iota(IT first, size_t n, T value, T stride) { value += stride; } } + void ConcatSDPTest::generate(int idx, const std::vector& targetInputStaticShapes) { inputs.clear(); auto create_input = [this] (std::shared_ptr param, ov::Shape shape, float val) { @@ -169,16 +175,19 @@ void ConcatSDPTest::generate(int idx, const std::vector& targetInputS create_input(function->get_parameters()[3], targetInputStaticShapes[1], idx + 4.0f); create_input(function->get_parameters()[4], ov::Shape{targetInputStaticShapes[0][0]}, idx + 0.0f); } + void ConcatSDPTest::prepare() { compile_model(); inferRequest = compiledModel.create_infer_request(); ASSERT_TRUE(inferRequest); } + void ConcatSDPTest::reset() { for (auto&& state : inferRequest.query_state()) { state.reset(); } } + std::vector ConcatSDPTest::run_test(std::shared_ptr model) { function = model; prepare(); @@ -201,6 +210,12 @@ std::vector ConcatSDPTest::run_test(std::shared_ptr model } TEST_P(ConcatSDPTest, CompareWithRefs) { SKIP_IF_CURRENT_TEST_IS_DISABLED(); + ElementType inType; + std::vector inputShapes; + bool forceKVU8; + bool hasShapeOf; + std::tie(inType, inputShapes, forceKVU8, hasShapeOf) = this->GetParam(); + auto actualOutputs = run_test(function); if (!hasShapeOf) { CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1); @@ -216,9 +231,14 @@ TEST_P(ConcatSDPTest, CompareWithRefs) { } } } + + // the range of our result will exceed f16 max value and there may be 
'inf'. In softmax, there is a step: + // v - max(v), if v is inf, the result of 'v-max(v)' will be nan + // use f32 as reference if (inType == ElementType::f16) { configuration["INFERENCE_PRECISION_HINT"] = "f32"; } + auto expectedOutputs = run_test(functionRefs); CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 0); for (size_t i = 0; i < actualOutputs.size(); i++) { diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.hpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.hpp index 56fad11f53e600..ac59e48f496b3b 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.hpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.hpp @@ -34,7 +34,7 @@ namespace test { template void strided_iota(IT first, size_t n, T value, T stride); -typedef std::tuple, bool> ConcatSDPTestParams; +typedef std::tuple, bool, bool> ConcatSDPTestParams; class ConcatSDPTest : public testing::WithParamInterface, @@ -46,7 +46,8 @@ class ConcatSDPTest : void prepare(); void reset(); std::vector run_test(std::shared_ptr model); - bool hasShapeOf; + bool m_forceKVU8; + bool m_hasShapeOf; protected: void generate_inputs(const std::vector& targetInputStaticShapes) override; void SetUp() override; diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp index bc73de76999daf..d05e7840562191 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp @@ -18,6 +18,7 @@ namespace test { using InputShapeAndTransposeOrder = std::pair, std::vector>; using ConcatMultiQuerySDPParams = std::tuple; // Subgraph: @@ -52,8 +53,10 @@ class ConcatMultiQuerySDPTest : public testing::WithParamInterface& obj) { ElementType qkvType; InputShapeAndTransposeOrder inputShapeAndOrders; - bool hasShapeof; - std::tie(qkvType, inputShapeAndOrders, hasShapeof) = obj.param; + bool forceKVU8; + bool hasShapeOf; + std::tie(qkvType, inputShapeAndOrders, forceKVU8, hasShapeOf) = obj.param; + ElementType kvCacheType = forceKVU8 ? 
ov::element::Type_t::u8 : qkvType; std::ostringstream result; std::vector& inputShapes = inputShapeAndOrders.first; std::vector& transposeOrder = inputShapeAndOrders.second; @@ -71,8 +74,9 @@ class ConcatMultiQuerySDPTest : public testing::WithParamInterfaceGetParam(); + std::tie(qkvType, inputShapeAndOrders, forceKVU8, hasShapeOf) = this->GetParam(); std::vector& inputShapes = inputShapeAndOrders.first; std::vector& transposeOrder = inputShapeAndOrders.second; targetDevice = ov::test::utils::DEVICE_CPU; rel_threshold = 1e-2f; configuration[ov::hint::inference_precision.name()] = ov::element::f32; - if (qkvType == ElementType::bf16) { - configuration[ov::hint::inference_precision.name()] = ov::element::bf16; + if (qkvType == ElementType::bf16 || qkvType == ElementType::f16) { + configuration[ov::hint::inference_precision.name()] = ov::element::Type(qkvType).get_type_name(); rel_threshold = 0.01f; } + if (forceKVU8) + configuration["KV_CACHE_PRECISION"] = "u8"; init_input_shapes(inputShapes); ov::ParameterVector inputParams; // q,k,v @@ -229,6 +236,10 @@ class ConcatMultiQuerySDPTest : public testing::WithParamInterface(t.data()), t.get_size(), val, 0.1f); inputs.insert({param, t}); + } else if (param->get_element_type() == element::f16) { + ov::Tensor t{ov::element::f16, shape}; + strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); + inputs.insert({param, t}); } else { ov::Tensor t{ov::element::bf16, shape}; strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); @@ -269,6 +280,10 @@ class ConcatMultiQuerySDPTest : public testing::WithParamInterface b.get_name(); + }); for (std::string name : {"pastk", "pastv"}) { auto itr = std::find_if(states.begin(), states.end(), [&](const ov::VariableState& state) { return name == state.get_name(); @@ -290,17 +305,20 @@ class ConcatMultiQuerySDPTest : public testing::WithParamInterfaceGetParam(); - if (qkvType == ElementType::bf16 && !ov::with_cpu_x86_bfloat16()) - GTEST_SKIP(); + std::tie(qkvType, inputShapeAndOrders, forceKVU8, hasShapeOf) = this->GetParam(); auto actualOutputs = run_test(function); CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1); CheckNumberOfNodesWithType(compiledModel, "Concatenation", 0); CheckNumberOfNodesWithType(compiledModel, "Reorder", 0); CheckNumberOfNodesWithType(compiledModel, "Transpose", 1); CheckNumberOfNodesWithType(compiledModel, "Gather", 0); + // use f32 as reference + if (qkvType == ElementType::f16) { + configuration["INFERENCE_PRECISION_HINT"] = "f32"; + } auto expectedOutputs = run_test(functionRefs); CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 0); for (size_t i = 0; i < actualOutputs.size(); i++) { @@ -384,8 +402,9 @@ const std::vector inputShapeAndReorders = {{ INSTANTIATE_TEST_SUITE_P(smoke_ConcatMultiQuerySDPTest, ConcatMultiQuerySDPTest, - ::testing::Combine(::testing::Values(ElementType::f32, ElementType::bf16), + ::testing::Combine(::testing::Values(ElementType::f32, ElementType::bf16, ElementType::f16), ::testing::ValuesIn(inputShapeAndReorders), + ::testing::Values(true, false), ::testing::Values(true, false)), ConcatMultiQuerySDPTest::getTestCaseName); } // namespace diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_sdp.cpp index f9971a7fe9ce16..57927434524891 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_sdp.cpp +++ 
b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_sdp.cpp @@ -37,8 +37,10 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConcatSDPTest, ConcatSDPTest, ::testing::Combine(::testing::Values(ElementType::f32), ::testing::ValuesIn(inputShapes), + ::testing::Values(true, false), ::testing::Values(true, false)), ConcatSDPTest::getTestCaseName); + } // namespace } // namespace test diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp index 839370d3a97728..65bc379c78b540 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp @@ -253,6 +253,10 @@ class ConcatSDPTransposeTest : public ConcatSDPTransposeTestBase { outputs.push_back(copy); } auto states = inferRequest.query_state(); + // k, v may be in any order + std::sort(states.begin(), states.end(), [] (VariableState& a, VariableState& b) { + return a.get_name() > b.get_name(); + }); for (std::string name : {"pastk", "pastv"}) { auto itr = std::find_if(states.begin(), states.end(), [&](const ov::VariableState& state) { return name == state.get_name(); diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/denormal_check.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/denormal_check.cpp index b98d4c61a1fb43..39fe70ebd87df4 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/denormal_check.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/denormal_check.cpp @@ -36,8 +36,8 @@ void SetUp() override { targetStaticShapes.push_back({inpShape}); targetDevice = ov::test::utils::DEVICE_CPU; - const auto elemsCount = shape_size(inpShape); const auto rtPrc = ov::element::f32; + const auto elemsCount = shape_size(inpShape) * rtPrc.size(); ov::ParameterVector params {std::make_shared(rtPrc, ov::Shape(inpShape))}; pConstStorage.reset(new ov::AlignedBuffer(elemsCount, alignment)); diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/concat_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/concat_sdp.cpp new file mode 100644 index 00000000000000..93c99048fec349 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/concat_sdp.cpp @@ -0,0 +1,47 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "openvino/opsets/opset13.hpp" +#include "openvino/pass/manager.hpp" +#include "transformations/op_conversions/scaled_dot_product_attention_decomposition.hpp" + +#include "custom/subgraph_tests/src/classes/concat_sdp.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "utils/cpu_test_utils.hpp" +#include "common_test_utils/ov_tensor_utils.hpp" + +using namespace CPUTestUtils; + +namespace ov { +namespace test { + +namespace { +const std::vector> inputShapes = { + // greedy search + { + // B, H, L1, S + {{1, 8, -1, 64}, {{1, 8, 10, 64}, {1, 8, 1, 64}, {1, 8, 1, 64}, {1, 8, 20, 64}, {1, 8, 1, 64}}}, + // B, H, L0, S + {{1, 8, -1, 64}, {{1, 8, 0, 64}, {1, 8, 10, 64}, {1, 8, 11, 64}, {1, 8, 12, 64}, {1, 8, 32, 64}}}, + }, + // beam search + { + // B, H, L1, S + {{-1, 8, -1, 64}, {{4, 8, 10, 64}, {4, 8, 1, 64}, {4, 8, 1, 64}, {4, 8, 1, 64}, 
{4, 8, 1, 64}}}, + // B, H, L0, S + {{-1, 8, -1, 64}, {{4, 8, 0, 64}, {4, 8, 10, 64}, {4, 8, 11, 64}, {4, 8, 12, 64}, {4, 8, 13, 64}}}, + }, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_ConcatSDPTest, + ConcatSDPTest, + ::testing::Combine(::testing::Values(ElementType::bf16, ElementType::f16), + ::testing::ValuesIn(inputShapes), + ::testing::Values(true, false), + ::testing::Values(true, false)), + ConcatSDPTest::getTestCaseName); + +} // namespace + +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 2a8f49b5dcfe0e..e7c006ab97427f 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -358,6 +358,8 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(smoke_VariableState/OVInferRequestVariableStateTest.*)"); // Issue: 141705 retVector.emplace_back(R"(.*smoke_arm_Deconv_2D_Planar_FP16/DeconvolutionLayerCPUTest.*INFERENCE_PRECISION_HINT=f16.*)"); + // Issue: 154882 + retVector.emplace_back(R"(.*ConcatMultiQuerySDPTest.*f16.*)"); #endif #if defined(OPENVINO_ARCH_ARM) @@ -529,6 +531,8 @@ std::vector disabledTestPatterns() { if (!ov::with_cpu_x86_avx512_core_fp16()) { // Skip fp16 tests for paltforms that don't support fp16 precision retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)"); + retVector.emplace_back(R"(.*ConcatMultiQuerySDPTest.*f16.*)"); + retVector.emplace_back(R"(.*ConcatSDPTest.*f16.*)"); } #elif defined(OPENVINO_ARCH_ARM64) || defined(OPENVINO_ARCH_ARM) if (!ov::intel_cpu::hasHardwareSupport(ov::element::f16)) { @@ -560,6 +564,7 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(.*smoke_Snippets_EnforcePrecision_bf16.*)"); retVector.emplace_back(R"(.*smoke_Snippets_MHAWOTransposeEnforceBF16.*)"); retVector.emplace_back(R"(.*smoke_Snippets_MHAEnforceBF16.*)"); + retVector.emplace_back(R"(.*ConcatSDPTest.*bf16.*)"); } // [150842] Need to support dynamic K dimension of BF16|INT8 MatMul on AMX systems if (ov::with_cpu_x86_avx512_core_amx()) { diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/lora_pattern.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/lora_pattern.cpp new file mode 100644 index 00000000000000..d85ced5f07a92e --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/lora_pattern.cpp @@ -0,0 +1,21 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "subgraph_tests/lora_pattern.hpp" + +using namespace ov::test; + +namespace { + +INSTANTIATE_TEST_SUITE_P(smoke, + LoraPatternConvolution, + ::testing::Values(ov::test::utils::DEVICE_CPU), + LoraPatternBase::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke, + LoraPatternMatmul, + ::testing::Values(ov::test::utils::DEVICE_CPU), + LoraPatternBase::getTestCaseName); + +} // namespace diff --git a/src/plugins/intel_cpu/tests/unit/brgemm_executor_test.cpp b/src/plugins/intel_cpu/tests/unit/brgemm_executor_test.cpp index 35a29f97452d4b..9ae58561d4dfcd 100644 --- a/src/plugins/intel_cpu/tests/unit/brgemm_executor_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/brgemm_executor_test.cpp @@ -28,18 +28,19 @@ void run_test(ov::element::Type rtPrec) { size_t K = 33; ov::intel_cpu::BrgemmKernel gemm(M, N, K, K, N, N, false, 
rtPrec); size_t nthr = 8; - bool is_bf16 = (rtPrec == ov::element::bf16); + bool is_f32 = (rtPrec == ov::element::f32); std::vector a_data(M * K, (1.0f/33)); std::vector b_data(K * N, 4.0f); std::vector c_data(nthr * M * N, 0.0f); std::vector wsp(nthr * 4 * 1024, 0.0f); - std::vector b_scracth(gemm.get_scratch_b_size(), 0.0f); - std::vector a_scracth(gemm.get_scratch_a_size(), 0.0f); - if (is_bf16) - gemm.copy_buffer_b(b_data.data(), b_scracth.data()); + std::vector a_scratch(gemm.get_scratch_a_size(), 0.0f); + std::vector b_scratch(gemm.get_scratch_b_size(), 0.0f); + if (!is_f32) { + gemm.copy_buffer_b(b_data.data(), b_scratch.data()); + } auto m_block_size = gemm.get_mblk_size(); auto m_blocks = (M + gemm.get_mblk_size() - 1) / m_block_size; - T* b_ptr = is_bf16 ? b_scracth.data() : b_data.data(); + void* b_ptr = !is_f32 ? static_cast(b_scratch.data()) : static_cast(b_data.data()); ov::parallel_for2d(nthr, m_blocks, [&](size_t i, size_t m_blk) { auto m_start = m_blk * m_block_size; auto m_end = std::min(m_start + m_block_size, M); @@ -49,7 +50,7 @@ void run_test(ov::element::Type rtPrec) { b_ptr, c_data.data() + i * M * N + m_start * N, wsp.data() + i * 4 * 1024, - a_scracth.data()); + a_scratch.data()); }); ov::parallel_for(nthr, [&](size_t i){ for (size_t m = 0; m < M; m++) { @@ -73,9 +74,13 @@ TEST_P(BrgemmKernelTest, simpleGemmTest) { GTEST_SKIP(); if (rtPrec == ov::element::f32 && !ov::with_cpu_x86_avx512_core()) GTEST_SKIP(); + if (rtPrec == ov::element::f16 && !ov::with_cpu_x86_avx512_core_fp16()) + GTEST_SKIP(); if (rtPrec == ov::element::bf16) { run_test(rtPrec); + } else if (rtPrec == ov::element::f16) { + run_test(rtPrec); } else { run_test(rtPrec); } @@ -83,6 +88,6 @@ TEST_P(BrgemmKernelTest, simpleGemmTest) { INSTANTIATE_TEST_SUITE_P(BrgemmKernelUnitTest, BrgemmKernelTest, - ::testing::Values(ov::element::f32, ov::element::bf16), + ::testing::Values(ov::element::f32, ov::element::bf16, ov::element::f16), BrgemmKernelTest::getTestCaseName); } // namespace brgemmUnitTest diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/search_sorted_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/search_sorted_shape_inference_test.cpp new file mode 100644 index 00000000000000..ac0b4763b7bf5d --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/search_sorted_shape_inference_test.cpp @@ -0,0 +1,114 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "common_test_utils/test_assertions.hpp" +#include "utils.hpp" + +using namespace ov; +using namespace ov::intel_cpu; +using ov::op::v0::Constant; +using ov::op::v0::Parameter; +using testing::HasSubstr; + +class SearchSortedShapeInferenceTest : public OpStaticShapeInferenceTest {}; + +TEST_F(SearchSortedShapeInferenceTest, same_dimensions_nd_inputs) { + const auto sorted = std::make_shared(element::i64, PartialShape::dynamic()); + const auto values = std::make_shared(element::i64, PartialShape::dynamic()); + const auto op = make_op(sorted, values); + const auto input_shapes = ShapeVector{StaticShape{1, 3, 6}, StaticShape{1, 3, 6}}; + const auto output_shapes = shape_inference(op.get(), input_shapes); + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes.front(), StaticShape({1, 3, 6})); +} + +TEST_F(SearchSortedShapeInferenceTest, scalar_values) { + const auto sorted = std::make_shared(element::i64, PartialShape::dynamic()); + const auto values = std::make_shared(element::i64, PartialShape::dynamic()); + 
const auto op = make_op(sorted, values); + const auto input_shapes = ShapeVector{StaticShape{3}, StaticShape{}}; + const auto output_shapes = shape_inference(op.get(), input_shapes); + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes.front(), StaticShape{}); +} + +TEST_F(SearchSortedShapeInferenceTest, different_last_dim) { + const auto sorted = std::make_shared(element::i64, PartialShape::dynamic()); + const auto values = std::make_shared(element::i64, PartialShape::dynamic()); + const auto op = make_op(sorted, values); + const auto input_shapes = ShapeVector{StaticShape{1, 3, 7, 100}, StaticShape{1, 3, 7, 10}}; + const auto output_shapes = shape_inference(op.get(), input_shapes); + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes.front(), StaticShape({1, 3, 7, 10})); +} + +TEST_F(SearchSortedShapeInferenceTest, 1d_inputs) { + const auto sorted = std::make_shared(element::i64, PartialShape::dynamic()); + const auto values = std::make_shared(element::i64, PartialShape::dynamic()); + const auto op = make_op(sorted, values); + const auto input_shapes = ShapeVector{StaticShape{5}, StaticShape{20}}; + const auto output_shapes = shape_inference(op.get(), input_shapes); + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes.front(), StaticShape({20})); +} + +TEST_F(SearchSortedShapeInferenceTest, 1d_sequence) { + const auto sorted = std::make_shared(element::i64, PartialShape::dynamic()); + const auto values = std::make_shared(element::i64, PartialShape::dynamic()); + const auto op = make_op(sorted, values); + const auto input_shapes = ShapeVector{StaticShape{50}, StaticShape{1, 3, 7, 10}}; + const auto output_shapes = shape_inference(op.get(), input_shapes); + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes.front(), StaticShape({1, 3, 7, 10})); +} + +TEST_F(SearchSortedShapeInferenceTest, element_type_consistency_validation) { + const auto sorted = std::make_shared(element::i64, PartialShape::dynamic()); + const auto values = std::make_shared(element::i32, PartialShape::dynamic()); + OV_EXPECT_THROW(std::ignore = make_op(sorted, values), + NodeValidationFailure, + testing::HasSubstr("must have the same element type")); +} + +TEST_F(SearchSortedShapeInferenceTest, input_shapes_ranks_validation) { + const auto sorted = std::make_shared(element::i32, PartialShape::dynamic()); + const auto values = std::make_shared(element::i32, PartialShape::dynamic()); + const auto op = make_op(sorted, values); + const auto input_shapes = ShapeVector{StaticShape{1, 3, 6}, StaticShape{1, 3, 6, 7}}; + OV_EXPECT_THROW(std::ignore = shape_inference(op.get(), input_shapes), + NodeValidationFailure, + testing::HasSubstr("the ranks of the inputs have to be compatible")); +} + +TEST_F(SearchSortedShapeInferenceTest, input_shapes_compatibility) { + const auto sorted = std::make_shared(element::i32, PartialShape::dynamic()); + const auto values = std::make_shared(element::i32, PartialShape::dynamic()); + const auto op = make_op(sorted, values); + const auto input_shapes = ShapeVector{StaticShape{1, 3, 6}, StaticShape{1, 6, 6}}; + OV_EXPECT_THROW(std::ignore = shape_inference(op.get(), input_shapes), + NodeValidationFailure, + testing::HasSubstr("All dimensions but the last one have to be compatible")); +} + +TEST_F(SearchSortedShapeInferenceTest, scalar_sorted_sequence) { + const auto sorted = std::make_shared(element::i32, PartialShape::dynamic()); + const auto values = std::make_shared(element::i32, PartialShape::dynamic()); + const auto op = make_op(sorted, values); + 
const auto input_shapes = ShapeVector{StaticShape{}, StaticShape{1, 6, 6}}; + OV_EXPECT_THROW(std::ignore = shape_inference(op.get(), input_shapes), + NodeValidationFailure, + testing::HasSubstr("The sorted sequence input cannot be a scalar")); +} + +TEST_F(SearchSortedShapeInferenceTest, scalar_values_and_ND_sequence) { + const auto sorted = std::make_shared(element::i32, PartialShape::dynamic()); + const auto values = std::make_shared(element::i32, PartialShape::dynamic()); + const auto op = make_op(sorted, values); + const auto input_shapes = ShapeVector{StaticShape{2, 3}, StaticShape{}}; + OV_EXPECT_THROW(std::ignore = shape_inference(op.get(), input_shapes), + NodeValidationFailure, + testing::HasSubstr("the ranks of the inputs have to be compatible")); +} diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/rope.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/rope.hpp index 8f4ae2c66334ee..d7933e2180fe6f 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/rope.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/rope.hpp @@ -26,7 +26,11 @@ struct rope : public primitive_base { size_t gather_rank = 0) : primitive_base(id, inputs), config(config), - gather_rank(gather_rank) {} + gather_rank(gather_rank) { + OPENVINO_ASSERT((!config.support_2d_rope + || (config.support_2d_rope && config.is_chatglm)), + "2D RoPE is currently only supported in Chatglm!"); + } RoPE::Config config; size_t gather_rank = 0; @@ -38,6 +42,7 @@ struct rope : public primitive_base { seed = hash_combine(seed, config.head_size); seed = hash_combine(seed, config.input_trans0213); seed = hash_combine(seed, config.is_chatglm); + seed = hash_combine(seed, config.support_2d_rope); seed = hash_combine(seed, config.is_interleaved); seed = hash_combine(seed, config.is_qwen); seed = hash_combine(seed, config.rotary_ndims); @@ -58,6 +63,7 @@ struct rope : public primitive_base { config.head_size == rhs_casted.config.head_size && config.input_trans0213 == rhs_casted.config.input_trans0213 && config.is_chatglm == rhs_casted.config.is_chatglm && + config.support_2d_rope == rhs_casted.config.support_2d_rope && config.is_interleaved == rhs_casted.config.is_interleaved && config.is_qwen == rhs_casted.config.is_qwen && config.rotary_ndims == rhs_casted.config.rotary_ndims && @@ -73,6 +79,7 @@ struct rope : public primitive_base { ob << config.head_size; ob << config.input_trans0213; ob << config.is_chatglm; + ob << config.support_2d_rope; ob << config.is_interleaved; ob << config.is_qwen; ob << config.rotary_ndims; @@ -88,6 +95,7 @@ struct rope : public primitive_base { ib >> config.head_size; ib >> config.input_trans0213; ib >> config.is_chatglm; + ib >> config.support_2d_rope; ib >> config.is_interleaved; ib >> config.is_qwen; ib >> config.rotary_ndims; diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp index f869feba4a5334..049e7a29cb9c23 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp @@ -48,8 +48,8 @@ struct memory { virtual ~memory() = default; virtual void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) = 0; virtual void unlock(const stream& stream) = 0; - virtual event::ptr fill(stream& stream, unsigned char pattern) = 0; - virtual event::ptr fill(stream& stream) = 0; + virtual event::ptr fill(stream& stream, unsigned char pattern, bool blocking = true) = 0; + virtual event::ptr 
fill(stream& stream, bool blocking = true) = 0; // only supports gpu_usm virtual void* buffer_ptr() const { return nullptr; } @@ -147,8 +147,8 @@ struct simple_attached_memory : memory { void* lock(const stream& /* stream */, mem_lock_type /* type */) override { return _pointer; } void unlock(const stream& /* stream */) override {} - event::ptr fill(stream& /* stream */, unsigned char) override { return nullptr; } - event::ptr fill(stream& /* stream */) override { return nullptr; } + event::ptr fill(stream& /* stream */, unsigned char, bool) override { return nullptr; } + event::ptr fill(stream& /* stream */, bool) override { return nullptr; } shared_mem_params get_internal_params() const override { return { shared_mem_type::shared_mem_empty, nullptr, nullptr, nullptr, #ifdef _WIN32 nullptr, diff --git a/src/plugins/intel_gpu/src/graph/crop.cpp b/src/plugins/intel_gpu/src/graph/crop.cpp index e17cc3e5552849..e3ff36ceae38a5 100644 --- a/src/plugins/intel_gpu/src/graph/crop.cpp +++ b/src/plugins/intel_gpu/src/graph/crop.cpp @@ -50,7 +50,7 @@ std::vector crop_inst::calc_output_layouts(const crop_node& /*node*/, co std::vector input_shapes = { impl_param.input_layouts[0].get(), }; - for (size_t i = 1; i < impl_param.input_layouts.size(); ++i) { + for (size_t i = 1; i < desc->input.size(); ++i) { input_shapes.push_back(impl_param.input_layouts[i].get()); } int64_t axis = desc->axis; diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index b7017c414c505f..7bdbc53ad54d16 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -461,7 +461,7 @@ bool crop_in_place_optimization::match(const program_node& node, return false; // if the node is marked as network output, prevent optimizations which would affect a form of its output, // unless debug flag is set - if (node.is_output() || crop_params.fused_desc.size() > 0 || node.is_in_shape_of_subgraph()) + if (node.is_output() || crop_params.has_fused_primitives() || node.is_in_shape_of_subgraph()) return false; const auto& crop_layout = crop_params.get_output_layout(); @@ -547,6 +547,9 @@ bool crop_in_place_optimization::optimize(crop_node& node) { auto input_layout = node.get_input_layout(0); auto crop_params = node.get_kernel_impl_params(); + if (crop_params->has_fused_primitives()) + return false; + // Regular crop // crop input buffer // |___________data____________| diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index b42ab89eafd61a..5e8380f35dcb93 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -736,6 +736,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { should_fuse |= input.is_type(); + should_fuse |= input.is_type(); + bool legacy_fusion = activation_node.get_dependencies().size() == 1 && !input.can_be_optimized() && !activation_node.is_constant() && @@ -920,7 +922,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { (parents[i].first->is_type()) || (parents[i].first->is_type() && reduce_supports_fusings(parents[i].first->as())) || - (parents[i].first->is_type()); + (parents[i].first->is_type()) || + (parents[i].first->is_type()); } // 
Disable fusion to a node on constant path when second input is in data flow @@ -1045,6 +1048,15 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { std::swap(fused_idx, peer_idx); } + // Avoid fusing with GEMM from the LoRA pattern, that can be optimized in case of empty adapters + if (parents[fused_idx].first->is_type()) { + if (parents[peer_idx].first->is_type() || + (parents[peer_idx].first->is_type() && + parents[peer_idx].first->get_dependency(0).is_type())) { + std::swap(fused_idx, peer_idx); + } + } + auto fused_node = parents[fused_idx].first; auto peer_node = parents[peer_idx].first; if (lo.get_optimization_attributes().use_onednn_impls && lo.is_primitive_implemented_for_onednn(*fused_node)) { diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index dff6b16d30a2ad..28ee84c4a4ec02 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -434,7 +434,7 @@ void remove_redundant_reorders::run(program& p) { (input.is_type() || input.is_type() || input.is_type() || input.is_type() || input.is_type() || input.is_type() || input.is_type() || input.is_type() || input.is_type() || - input.is_type() || input.is_type()) && !input.is_constant(); if (!same_data_type && !allowed_dt_conversion_fuse) continue; diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp index 6c16618ac816d0..5692b6037a09e0 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp @@ -59,7 +59,7 @@ struct read_value_impl : public typed_primitive_impl { if (instance.get_impl_params()->input_layouts.size() > 0) { variable.get_memory()->copy_from(stream, instance.dep_memory(0), true); } else { - variable.get_memory()->fill(stream, 0); + variable.get_memory()->fill(stream); } } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp index 41934847f899de..174ea1fa1767a9 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp @@ -154,6 +154,13 @@ struct gemm_impl : multi_stage_primitive { } event::ptr execute_impl(const std::vector& events, gemm_inst& instance) override { + if (instance.get_input_layout(0).count() == 0 || + instance.get_input_layout(1).count() == 0) { + stream& stream = instance.get_network().get_stream(); + stream.enqueue_barrier(); + return instance.output_memory_ptr()->fill(stream, false); + } + if (need_indirect_load(instance)) return execute_stage(events, instance, indirect_gemm); else diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp index d8f0e45c25146f..8c08afc0428432 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp @@ -37,7 +37,7 @@ struct count_nonzero_impl : typed_primitive_impl_ocl { event::ptr execute_impl(const std::vector& events, count_nonzero_inst& instance) override { if (instance.get_impl_params()->input_layouts[0].count() == 0) { // set count of non-zero elements to 0 in case if input tensor is empty to have correct memory alloc for gather_nonzero - return 
instance.output_memory(0).fill(instance.get_network().get_stream(), 0); + return instance.output_memory(0).fill(instance.get_network().get_stream()); } else { return parent::execute_impl(events, instance); } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp index f65768b8e6eb20..7764b7b0964d1c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp @@ -53,6 +53,7 @@ struct rope_impl : typed_primitive_impl_ocl { params.is_qwen = primitive->config.is_qwen; params.is_chatglm = primitive->config.is_chatglm; + params.support_2d_rope = primitive->config.support_2d_rope; params.transposed_input = primitive->config.input_trans0213; for (size_t i = 1; i < impl_param.input_layouts.size(); ++i) { diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 13634b49fd9d96..095dc5fd45fa52 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -655,7 +655,7 @@ event::ptr primitive_inst::realloc_if_needed() { } } - // Clear out memory if if was previously reused, but now primitive can't be optimized + // Clear out memory if was previously reused, but now primitive can't be optimized if (!_node->is_type() && (_node->is_runtime_skippable() || _node->is_type())) { if (can_be_optimized()) { _max_output_layout_count = _deps[0].first->_max_output_layout_count; @@ -663,7 +663,7 @@ event::ptr primitive_inst::realloc_if_needed() { return ev; } else if (_outputs[0] && dep_memory_ptr(0) && _network.get_engine().is_the_same_buffer(dep_memory(0), output_memory(0))) { - // Clear out memory if if was previously reused, but now primitive can't be optimized + // Clear out memory if was previously reused, but now primitive can't be optimized _outputs[0] = nullptr; _max_output_layout_count[0] = 0; } @@ -1527,7 +1527,7 @@ event::ptr primitive_inst::execute(const std::vector& events) { } if (can_skip_execution) { - auto ev = get_network().get_stream().create_user_event(true); + auto ev = get_network().get_stream().aggregate_events(events); update_shape_done_by_other = false; // reset return ev; } diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 03cc8df8b4338c..3a3793e8ad764d 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -1898,6 +1898,7 @@ void program::load(cldnn::BinaryInputBuffer& ib) { _loaded_from_cache = true; processing_order.load(ib, *this); + set_layout_optimizer_attributes(*_layout_optimizer); { auto& kernels_cache = get_kernels_cache(); diff --git a/src/plugins/intel_gpu/src/graph/rope.cpp b/src/plugins/intel_gpu/src/graph/rope.cpp index ea904916d4cf41..e168626f8d69a2 100644 --- a/src/plugins/intel_gpu/src/graph/rope.cpp +++ b/src/plugins/intel_gpu/src/graph/rope.cpp @@ -30,11 +30,24 @@ std::vector rope_inst::calc_output_layouts(rope_node const& node, kernel ShapeType output_shape = input0_shape; - if (desc->config.is_qwen || desc->config.is_chatglm) { + if (desc->config.is_qwen) { output_shape = { input0_shape[0], input0_shape[1], ov::Dimension(desc->config.head_cnt), ov::Dimension(desc->config.head_size) }; + } else if (desc->config.is_chatglm) { + if (desc->config.support_2d_rope) { + // input0_shape = [batch_size, seq_length] + output_shape = { input0_shape[0], + ov::Dimension(desc->config.head_cnt), + input0_shape[1], + 
ov::Dimension(desc->config.head_size) }; + } else { + output_shape = { input0_shape[0], + input0_shape[1], + ov::Dimension(desc->config.head_cnt), + ov::Dimension(desc->config.head_size) }; + } } else { auto input_slice_size = desc->config.slice_stop - desc->config.slice_start; if (input_slice_size > 0) { @@ -63,6 +76,7 @@ std::string rope_inst::to_string(rope_node const& node) { rope_info.add("head_size", desc->config.head_size); rope_info.add("input_trans0213", desc->config.input_trans0213); rope_info.add("is_chatglm", desc->config.is_chatglm); + rope_info.add("support_2d_rope", desc->config.support_2d_rope); rope_info.add("is_interleaved", desc->config.is_interleaved); rope_info.add("is_qwen", desc->config.is_qwen); rope_info.add("rotary_ndims", desc->config.rotary_ndims); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl index 36d4306b59ba79..38066b4461def4 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl @@ -11,12 +11,22 @@ KERNEL(rope_ref)( const __global INPUT1_TYPE* cos_sin, __global OUTPUT_TYPE* output) { +#ifdef SUPPORT_2D_ROPE + const uint p = get_global_id(0) / HEAD_COUNT; + const uint h = get_global_id(0) % HEAD_COUNT; + const uint b = get_global_id(1);//sequence length + const uint rf = get_global_id(2);//max(HALF_ROTARY_NDIMS, HEAD_SIZE - ROTARY_NDIMS) + uint output_idx = OUTPUT_GET_INDEX(p, h, b, 0); +#else const uint p = get_global_id(0); const uint b = get_global_id(1); const uint h = (uint)get_global_id(2) % HEAD_COUNT; const uint rf = (uint)get_global_id(2) / HEAD_COUNT; + uint output_idx = OUTPUT_GET_INDEX(p, b, h, 0); +#endif + uint r = rf < HALF_ROTARY_NDIMS ? rf * 2 : 0; - uint f = rf < HEAD_SIZE - ROTARY_NDIMS ? rf : 0; + uint f = rf < HEAD_SIZE - ROTARY_NDIMS ? rf * 2 : 0; #ifdef ENABLE_SLICE uint input_idx = GET_DATA_INDEX(SLICED_INPUT0, p, b, h * HEAD_SIZE, 0); @@ -30,19 +40,18 @@ KERNEL(rope_ref)( uint cos_sin_b = b < INPUT1_FEATURE_NUM ? 
b : 0; uint cos_sin_idx = INPUT1_GET_INDEX(cos_sin_p, cos_sin_b, 0, 0); - uint output_idx = OUTPUT_GET_INDEX(p, b, h, 0); - - INPUT1_TYPE cosv = cos_sin[cos_sin_idx + r]; - INPUT1_TYPE sinv = cos_sin[cos_sin_idx + r + 1]; + float cosv = convert_float(cos_sin[cos_sin_idx + r]); + float sinv = convert_float(cos_sin[cos_sin_idx + r + 1]); - INPUT0_TYPE in1 = input[input_idx + r]; - INPUT0_TYPE in2 = input[input_idx + r + 1]; + float in1 = convert_float(input[input_idx + r]); + float in2 = convert_float(input[input_idx + r + 1]); - output[output_idx + r] = cosv * in1 - sinv * in2; - output[output_idx + r + 1] = sinv * in1 + cosv * in2; + output[output_idx + r] = TO_OUTPUT_TYPE(cosv * in1 - sinv * in2); + output[output_idx + r + 1] = TO_OUTPUT_TYPE(sinv * in1 + cosv * in2); #ifdef ENABLE_IO_COPY output[output_idx + ROTARY_NDIMS + f] = input[input_idx + ROTARY_NDIMS + f]; + output[output_idx + ROTARY_NDIMS + f + 1] = input[input_idx + ROTARY_NDIMS + f + 1]; #endif } #endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/slice_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/slice_ref.cl index c9e2c0688e1968..ba36ee859412ec 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/slice_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/slice_ref.cl @@ -24,7 +24,7 @@ out_name[4] = in_prefix##_VAL4; #endif -KERNEL(slice_ref)(OPTIONAL_SHAPE_INFO_ARG +KERNEL(slice_ref)(OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* restrict input, START_BUFFER STEP_BUFFER diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp index a9e0818aeae2f5..a48632f6c45509 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp @@ -70,6 +70,9 @@ JitConstants RoPEKernelBase::GetJitConstants(const rope_params& params, RoPEKern if (params.is_qwen) { jit.AddConstant(MakeJitConstant("QWEN", true)); } else if (params.is_chatglm) { + if (params.support_2d_rope) { + jit.AddConstant(MakeJitConstant("SUPPORT_2D_ROPE", true)); + } jit.AddConstant(MakeJitConstant("CHATGLM", true)); } else { jit.AddConstant(MakeJitConstant("RotateHalf", true)); @@ -85,10 +88,22 @@ RoPEKernelBase::DispatchData RoPEKernelBase::SetDefault(const rope_params& param std::vector> dims_by_gws = {{ Tensor::DataChannelName::BATCH }, { Tensor::DataChannelName::FEATURE }, { Tensor::DataChannelName::Y, Tensor::DataChannelName::X }}; - if (params.is_chatglm || params.is_qwen) { + if (params.is_qwen) { dispatchData.gws = {input.Batch().v, input.Feature().v, params.head_cnt * std::max(params.rotary_ndims / 2ul, params.head_size - params.rotary_ndims)}; + } else if (params.is_chatglm) { + if (params.support_2d_rope) { + // input [batch_size, seq_length] + // output [batch_size, head_count, seq_length, half_rotary_ndims] + dispatchData.gws = {input.Batch().v * params.head_cnt, + input.Feature().v, + params.rotary_ndims / 2ul}; + } else { + dispatchData.gws = {input.Batch().v, + input.Feature().v, + params.head_cnt * (params.rotary_ndims / 2ul)}; + } } else { dispatchData.gws = {output.Batch().v, output.Feature().v, diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.h index 5d55fd082765e8..472131eba5d82f 100644 --- 
a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.h @@ -24,6 +24,7 @@ struct rope_params : public base_params { bool is_qwen = false; bool is_chatglm = false; + bool support_2d_rope = false; bool transposed_input = false; }; diff --git a/src/plugins/intel_gpu/src/plugin/ops/variable.cpp b/src/plugins/intel_gpu/src/plugin/ops/variable.cpp index 9d7d6854009316..d655e297e4a2c6 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/variable.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/variable.cpp @@ -45,6 +45,12 @@ void CreateVariableAccessPrimitive(ProgramBuilder &p, const std::shared_ptr op) { + return ov::is_type(op) || + ov::is_type(op) || + ov::is_type(op); +} + void CreateReadValueOp(ProgramBuilder& p, const std::shared_ptr& op) { validate_inputs_count(op, {0, 1}); CreateVariableAccessPrimitive(p, op, op->get_variable_id()); @@ -57,6 +63,9 @@ void CreateReadValueOp(ProgramBuilder& p, const std::shared_ptr& op) { validate_inputs_count(op, {1}); + if (IsReadValueOp(op->get_input_node_shared_ptr(0))) { + return; + } CreateVariableAccessPrimitive(p, op, op->get_variable_id()); } @@ -67,6 +76,9 @@ void CreateReadValueOp(ProgramBuilder& p, const std::shared_ptr& op) { validate_inputs_count(op, {1}); + if (IsReadValueOp(op->get_input_node_shared_ptr(0))) { + return; + } CreateVariableAccessPrimitive(p, op, op->get_variable_id()); } diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 7ee587e612ad3d..4ea7851b3f8c58 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -575,7 +575,8 @@ std::vector Plugin::get_supported_internal_properties() const ov::PropertyName{ov::internal::exclusive_async_requests.name(), ov::PropertyMutability::RW}, ov::PropertyName{ov::internal::compiled_model_runtime_properties.name(), ov::PropertyMutability::RO}, ov::PropertyName{ov::internal::compiled_model_runtime_properties_supported.name(), ov::PropertyMutability::RO}, - ov::PropertyName{ov::internal::query_model_ratio.name(), PropertyMutability::RW}}; + ov::PropertyName{ov::internal::query_model_ratio.name(), PropertyMutability::RW}, + ov::PropertyName{ov::internal::caching_with_mmap.name(), PropertyMutability::RO}}; return supported_internal_properties; } diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 40c7ab48c486cb..f173e378fca3f9 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -862,7 +862,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { const size_t zp_pad_size = device_info.supports_immad ? 
16 : 32; manager.register_pass(zp_pad_size, device_info.supports_immad); - manager.register_pass(); + manager.register_pass(true); pass_config->disable(); pass_config->disable(); pass_config->disable(); diff --git a/src/plugins/intel_gpu/src/plugin/variable_state.cpp b/src/plugins/intel_gpu/src/plugin/variable_state.cpp index b24ddbd314a0cd..6b1c8d0cfc993f 100644 --- a/src/plugins/intel_gpu/src/plugin/variable_state.cpp +++ b/src/plugins/intel_gpu/src/plugin/variable_state.cpp @@ -70,6 +70,11 @@ void VariableState::set_state(const ov::SoPtr& state) { m_layout.set_partial_shape(src_shape); update_device_buffer(); + if (actual_size == 0) { + set(); + return; + } + // check whether the src tensor is padded std::vector src_stride_no_pad(src_rank, 1); std::vector upper_pad(src_rank, 0); diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp index a2ddc7dd2a4dff..f7e5ada9e24ef1 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp @@ -91,15 +91,15 @@ void gpu_buffer::unlock(const stream& stream) { } } -event::ptr gpu_buffer::fill(stream& stream) { +event::ptr gpu_buffer::fill(stream& stream, bool blocking) { if (_bytes_count == 0) { GPU_DEBUG_TRACE_DETAIL << "Skip EnqueueMemcpy for 0 size tensor" << std::endl; return stream.create_user_event(true); } - return fill(stream, 0); + return fill(stream, 0, blocking); } -event::ptr gpu_buffer::fill(stream& stream, unsigned char pattern) { +event::ptr gpu_buffer::fill(stream& stream, unsigned char pattern, bool blocking) { if (_bytes_count == 0) { GPU_DEBUG_TRACE_DETAIL << "Skip EnqueueMemcpy for 0 size tensor" << std::endl; return stream.create_user_event(true); @@ -109,6 +109,9 @@ event::ptr gpu_buffer::fill(stream& stream, unsigned char pattern) { cl::Event& ev_ocl = downcast(ev.get())->get(); try { cl_stream.get_cl_queue().enqueueFillBuffer(_buffer, pattern, 0, size(), nullptr, &ev_ocl); + if (blocking) { + ev_ocl.wait(); + } } catch (cl::Error const& err) { OPENVINO_THROW(OCL_ERR_MSG_FMT(err)); } @@ -272,15 +275,15 @@ gpu_image2d::gpu_image2d(ocl_engine* engine, _slice_pitch = _buffer.getImageInfo(); } -event::ptr gpu_image2d::fill(stream& stream) { +event::ptr gpu_image2d::fill(stream& stream, bool blocking) { if (_bytes_count == 0) { GPU_DEBUG_TRACE_DETAIL << "Skip EnqueueMemcpy for 0 size tensor" << std::endl; return stream.create_user_event(true); } - return fill(stream, 0); + return fill(stream, 0, blocking); } -event::ptr gpu_image2d::fill(stream& stream, unsigned char pattern) { +event::ptr gpu_image2d::fill(stream& stream, unsigned char pattern, bool blocking) { if (_bytes_count == 0) { GPU_DEBUG_TRACE_DETAIL << "Skip EnqueueMemcpy for 0 size tensor" << std::endl; return stream.create_user_event(true); @@ -291,6 +294,9 @@ event::ptr gpu_image2d::fill(stream& stream, unsigned char pattern) { cl_uint4 pattern_uint4 = {{pattern, pattern, pattern, pattern}}; try { cl_stream.get_cl_queue().enqueueFillImage(_buffer, pattern_uint4, {0, 0, 0}, {_width, _height, 1}, 0, &ev_ocl); + if (blocking) { + ev_ocl.wait(); + } } catch (cl::Error const& err) { OPENVINO_THROW(OCL_ERR_MSG_FMT(err)); } @@ -509,7 +515,7 @@ void gpu_usm::unlock(const stream& /* stream */) { } } -event::ptr gpu_usm::fill(stream& stream, unsigned char pattern) { +event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, bool blocking) { if (_bytes_count == 0) { GPU_DEBUG_TRACE_DETAIL << "Skip gpu_usm::fill for 0 size tensor" << std::endl; 
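// Illustrative sketch (editorial aside, not part of this patch): fill() now takes
// a `blocking` flag, defaulting to true (the enqueued OpenCL fill is waited on
// immediately), while a caller may pass false and synchronise on the returned
// event itself - as the gemm impl earlier in this diff does for empty inputs.
// The helper below is hypothetical; the types and the fill() call mirror the
// cldnn runtime API touched by this change.
#include "intel_gpu/runtime/memory.hpp"
#include "intel_gpu/runtime/stream.hpp"

static cldnn::event::ptr zero_output_async(cldnn::memory::ptr out, cldnn::stream& stream) {
    // Non-blocking: the fill is enqueued, no wait happens here.
    cldnn::event::ptr ev = out->fill(stream, /*blocking=*/false);
    // ... independent work can be enqueued while the fill is in flight ...
    return ev;  // the caller waits on (or chains) this event when the zeroed buffer is needed
}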
return stream.create_user_event(true); @@ -517,14 +523,12 @@ event::ptr gpu_usm::fill(stream& stream, unsigned char pattern) { auto& cl_stream = downcast(stream); auto ev = stream.create_base_event(); cl::Event& ev_ocl = downcast(ev.get())->get(); - // enqueueFillUsm call will never finish. Driver bug? Uncomment when fixed. Some older drivers doesn't support enqueueFillUsm call at all. - // cl_stream.get_usm_helper().enqueue_fill_mem(cl_stream.get_cl_queue(), _buffer.get(), pattern, _bytes_count, nullptr, &ev_ocl) - // Workarounded with enqeue_memcopy. ToDo: Remove below code. Uncomment above. - std::vector temp_buffer(_bytes_count, pattern); - // TODO: Do we really need blocking call here? Non-blocking one causes accuracy issues right now, but hopefully it can be fixed in more performant way. - const bool blocking = true; try { - cl_stream.get_usm_helper().enqueue_memcpy(cl_stream.get_cl_queue(), _buffer.get(), temp_buffer.data(), _bytes_count, blocking, nullptr, &ev_ocl); + cl_stream.get_usm_helper().enqueue_fill_mem( + cl_stream.get_cl_queue(), _buffer.get(), static_cast(&pattern), sizeof(unsigned char), _bytes_count, nullptr, &ev_ocl); + if (blocking) { + ev_ocl.wait(); + } } catch (cl::Error const& err) { OPENVINO_THROW(OCL_ERR_MSG_FMT(err)); } @@ -532,7 +536,7 @@ event::ptr gpu_usm::fill(stream& stream, unsigned char pattern) { return ev; } -event::ptr gpu_usm::fill(stream& stream) { +event::ptr gpu_usm::fill(stream& stream, bool blocking) { // event::ptr ev{ new base_event(_context), false }; // cl::Event ev_ocl = downcast(ev.get())->get(); // cl::usm::enqueue_set_mem(cl_stream.get_cl_queue(), _buffer.get(), 0, _bytes_count, nullptr, &ev_ocl); @@ -543,7 +547,7 @@ event::ptr gpu_usm::fill(stream& stream) { GPU_DEBUG_TRACE_DETAIL << "Skip EnqueueMemcpy for 0 size tensor" << std::endl; return stream.create_user_event(true); } - return fill(stream, 0); + return fill(stream, 0, blocking); } event::ptr gpu_usm::copy_from(stream& stream, const void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) { diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp index e2a68537cdc69e..e37518de3982a8 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp @@ -32,8 +32,8 @@ struct gpu_buffer : public lockable_gpu_mem, public memory { void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) override; void unlock(const stream& stream) override; - event::ptr fill(stream& stream, unsigned char pattern) override; - event::ptr fill(stream& stream) override; + event::ptr fill(stream& stream, unsigned char pattern, bool blocking = true) override; + event::ptr fill(stream& stream, bool blocking = true) override; shared_mem_params get_internal_params() const override; const cl::Buffer& get_buffer() const { assert(0 == _lock_count); @@ -58,8 +58,8 @@ struct gpu_image2d : public lockable_gpu_mem, public memory { void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) override; void unlock(const stream& stream) override; - event::ptr fill(stream& stream, unsigned char pattern) override; - event::ptr fill(stream& stream) override; + event::ptr fill(stream& stream, unsigned char pattern, bool blocking = true) override; + event::ptr fill(stream& stream, bool blocking = true) override; shared_mem_params get_internal_params() const override; const cl::Image2D& get_buffer() const { assert(0 == _lock_count); @@ 
-112,8 +112,8 @@ struct gpu_usm : public lockable_gpu_mem, public memory { cl::UsmMemory& get_buffer() { return _buffer; } void* buffer_ptr() const override { return _buffer.get(); } - event::ptr fill(stream& stream, unsigned char pattern) override; - event::ptr fill(stream& stream) override; + event::ptr fill(stream& stream, unsigned char pattern, bool blocking = true) override; + event::ptr fill(stream& stream, bool blocking = true) override; shared_mem_params get_internal_params() const override; event::ptr copy_from(stream& stream, const void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) override; diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp index 9565036f7b452d..741014b461e7f0 100644 --- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp +++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp @@ -44,5 +44,11 @@ INSTANTIATE_TEST_SUITE_P(smoke_RoPETestLlama2, ::testing::Values(ov::test::utils::DEVICE_GPU), RoPETestLlama2Slice::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_RoPETestChatGLM, + RoPETestChatGLM2DRoPEStridedSlice, + ::testing::Values(ov::test::utils::DEVICE_GPU), + RoPETestChatGLM2DRoPEStridedSlice::getTestCaseName); + + } // namespace test } // namespace ov diff --git a/src/plugins/intel_gpu/tests/unit/fusions/eltwise_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/eltwise_fusion_test.cpp index d4c50ec84ac78a..5d259a1a1862fc 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/eltwise_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/fusions/eltwise_fusion_test.cpp @@ -38,8 +38,10 @@ class EltwiseFusingTest : public ::BaseFusingTest { network network_fused(this->engine, this->topology_fused, cfg_fused); auto inputs = network_fused.get_input_ids(); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); + if (std::find(inputs.begin(), inputs.end(), "input") != inputs.end()) { + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + } if (std::find(inputs.begin(), inputs.end(), "input2") != inputs.end()) { network_fused.set_input_data("input2", input_prim2); network_not_fused.set_input_data("input2", input_prim2); @@ -699,3 +701,27 @@ TEST_P(eltwise_fusing_reorders, reorders_for_data_type) { INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_fusing_reorders, ::testing::ValuesIn(std::vector{ eltwise_test_params{ { 1, 16, 16, 2 }, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx, eltwise_mode::max, 4, 6 }, })); + +class eltwise_with_constant_input : public EltwiseFusingTest {}; +TEST_P(eltwise_with_constant_input, basic) { + auto p = GetParam(); + create_topologies(data("eltwise_data", get_mem(get_input_layout2(p), -10, 10)), + data("eltwise_data1", get_mem(get_input_layout2(p), -10, 10)), + eltwise("eltwise", {input_info("eltwise_data"), input_info("eltwise_data1")}, p.mode, p.default_type), + reorder("out", + input_info("eltwise"), + p.default_format, + data_types::f32, + std::vector(), + cldnn::reorder_mean_mode::subtract, + cldnn::padding(), + true) + ); + + tolerance = default_tolerance(p.input_type); + execute(p, true); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_with_constant_input, ::testing::ValuesIn(std::vector{ + 
eltwise_test_params{ CASE_ELTWISE_FP16_1, 0, 0}, +})); diff --git a/src/plugins/intel_gpu/tests/unit/fusions/fusion_test_common.hpp b/src/plugins/intel_gpu/tests/unit/fusions/fusion_test_common.hpp index a590fb9299a777..eb0f63c651e50d 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/fusion_test_common.hpp +++ b/src/plugins/intel_gpu/tests/unit/fusions/fusion_test_common.hpp @@ -81,8 +81,9 @@ class BaseFusingTest : public ::testing::TestWithParam { ASSERT_EQ(outputs_ref.size(), outputs_fused.size()); ASSERT_EQ(outputs_ref.size(), size_t(1)); + std::vector val_opt; auto val_ref = get_output_values_to_float(not_fused, outputs_ref.begin()->second); - auto val_opt = get_output_values_to_float(fused, outputs_fused.begin()->second); + ASSERT_NO_THROW(val_opt = get_output_values_to_float(fused, outputs_fused.begin()->second)); ASSERT_EQ(val_ref.size(), val_opt.size()); for (size_t i = 0; i < val_ref.size(); i++) { ASSERT_NEAR(val_ref[i], val_opt[i], tolerance) diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/non_zero_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/non_zero_gpu_test.cpp index 37a1ba8b982414..80122193265ebc 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/non_zero_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/non_zero_gpu_test.cpp @@ -556,7 +556,7 @@ TEST(non_zero_gpu, empty_input) { // Put some value into out buffer to ensure that it's non empty // That is needed to ensure that implementation correctly handles the cases when input tensor is empty and set count non zero to 0 - count_nonzero_inst->output_memory(0).fill(engine.get_service_stream(), 1); + count_nonzero_inst->output_memory(0).fill(engine.get_service_stream(), 1, true); engine.get_service_stream().finish(); auto count_nonzero_impl = count_nonzero_inst->get_impl(); diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp index 65ec475df6b986..fef9470545482a 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp @@ -30,7 +30,7 @@ void registerNPUWOptions(OptionsDesc& desc); DEFINE_OPT(NPU_USE_NPUW, bool, false, use_npuw, CompileTime); DEFINE_OPT(NPUW_DEVICES, std::string, "NPU,CPU", npuw::devices, CompileTime); DEFINE_OPT(NPUW_SUBMODEL_DEVICE, std::string, "", npuw::submodel_device, CompileTime); -DEFINE_OPT(NPUW_ONLINE_PIPELINE, std::string, "REP", npuw::partitioning::online::pipeline, CompileTime); +DEFINE_OPT(NPUW_ONLINE_PIPELINE, std::string, "REG", npuw::partitioning::online::pipeline, CompileTime); DEFINE_OPT(NPUW_ONLINE_AVOID, std::string, "", npuw::partitioning::online::avoid, CompileTime); DEFINE_OPT(NPUW_ONLINE_ISOLATE, std::string, "", npuw::partitioning::online::isolate, CompileTime); DEFINE_OPT(NPUW_ONLINE_NO_FOLD, std::string, "", npuw::partitioning::online::nofold, CompileTime); diff --git a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp index 31fa52c3878598..059977ee47a063 100644 --- a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp @@ -67,8 +67,8 @@ namespace online { * @brief * Type: std::string. * Specify which partitioning pipeline to run. - * Possible values: "NONE", "INIT", "JUST", "REP", "COMPUTE". - * Default value: "REP". + * Possible values: "NONE", "INIT", "JUST", "REP", "REG", "COMPUTE". 
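// Illustrative sketch (editorial aside, not part of this patch): NPUW_ONLINE_PIPELINE
// is a compile-time string option and its default moves from "REP" to "REG" here.
// A user who wants the previous partitioning back would request it explicitly,
// roughly as below; the property keys follow this header, while the model path,
// device name and the "YES" spelling for the boolean switch are assumptions.
//
//   ov::Core core;
//   auto model = core.read_model("model.xml");
//   auto compiled = core.compile_model(model, "NPU",
//                                      {{"NPU_USE_NPUW", "YES"},
//                                       {"NPUW_ONLINE_PIPELINE", "REP"}});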
+ * Default value: "REG". */ static constexpr ov::Property pipeline{"NPUW_ONLINE_PIPELINE"}; diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 2fe90eb82c41bb..a312a806cac4bc 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -516,11 +516,6 @@ std::string ov::npuw::CompiledModel::global_mem_device() const { } std::string ov::npuw::CompiledModel::funcall_mem_device(const std::size_t idx) const { - // FIXME: currently we allocate intermediate tensors for EVERY submodel. - // It's not feasible to allocate them in L0 due to high memory consumption. - // Until we make such memory reusable, hard-coding those tensors to CPU. - return "CPU"; - // Force globally set device if set const std::string device_alloc = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK_ALLOC>(); if (!device_alloc.empty()) { diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp index 4152d08275ba6d..038c1bb176b029 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp @@ -46,6 +46,8 @@ class CompiledModel : public ov::ICompiledModel { // FIXME: This class has many friends.. friend class IBaseInferRequest; friend class JustInferRequest; + friend class MemAccessSim; + friend class FuncMemMgr; bool compile_for_success(std::size_t id); bool compile_for_device(std::size_t id, const std::string& device_to_try); diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index fbbabf083bccd8..c4e2c3ee98b676 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -20,8 +20,173 @@ #include "util.hpp" #include "weights_bank.hpp" +ov::npuw::MemAccessSim::MemAccessSim(const std::shared_ptr& compiled_model) { + LOG_VERB("Running memory access simulation..."); + LOG_BLOCK(); + + // Initialize the read list + m_read_list.resize(compiled_model->m_compiled_submodels.size()); + + // Initialize read counters for tensors in the graph: + // 1. Interconnect + for (const auto& kvp : compiled_model->m_submodels_input_to_prev_output) { + const auto& read_to = kvp.first; // who reads + const auto& read_from = kvp.second; // reads what + + if (read_to == CompiledModel::NO_LINK || read_from == CompiledModel::NO_LINK) { + continue; + } + + // Record # of reads for this particular Source + m_remaining_reads[read_from]++; + + // Record a read request for this particular Subgraph (who reads the Source) + m_read_list[read_to.first].push_back(read_from); + } + // 2. 
Global model's outputs + for (auto&& read_from : compiled_model->m_outputs_to_submodels_outputs) { + m_remaining_reads[read_from]++; + } + + LOG_VERB("Done"); +} + +const ov::npuw::MemAccessSim::ReadList& ov::npuw::MemAccessSim::read_list(std::size_t idx) const { + return m_read_list.at(idx); +} + +std::size_t ov::npuw::MemAccessSim::remaining_reads(const LinkFrom& from) { + return m_remaining_reads.at(from); +} + +void ov::npuw::MemAccessSim::register_read(const LinkFrom& from) { + m_remaining_reads.at(from)--; +} + +ov::npuw::FuncMemMgr::FuncMemMgr(const std::shared_ptr& compiled_model) + : m_sim(compiled_model), + m_model(compiled_model) {} + +void ov::npuw::FuncMemMgr::set_alloc(AllocFcn&& fcn) { + m_alloc = std::move(fcn); +} + +void ov::npuw::FuncMemMgr::assign_memory() { + LOG_VERB("Assigning function memory..."); + LOG_BLOCK(); + + const auto num_submodels = m_model->m_compiled_submodels.size(); + + // Walk over the subgraphs, pre-allocate and pre-assign tensors to the subgraphs + // outputs. + for (std::size_t idx = 0u; idx < num_submodels; idx++) { + LOG_VERB("Process Subgraph[" << idx << "]"); + LOG_BLOCK(); + const auto& comp_model_desc = m_model->m_compiled_submodels[idx]; + if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) { + // no model & no funcall - optimized out, do nothing + continue; + } + + // Simulate subgraph execution: poll its input list first + const auto& read_list = m_sim.read_list(idx); + + // Now, get the outputs for the subgraph. If it is "regular", there's + // nothing to do - this subgraph owns its outputs on its own. + // If it is a function, though - look up in the function's memory storage. + if (comp_model_desc.replaced_by) { + const auto real_idx = comp_model_desc.replaced_by.value(); + const auto& proto_comp_model_desc = m_model->m_compiled_submodels[real_idx]; + + const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size(); + for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) { + const LinkFrom this_out = LinkFrom{idx, out_idx}; + assign(this_out); + } + } + + // Here happens the imaginary execution... Hocus pocus, done - that's a + // simulation after all + // After the execution, mark that the read_list was read. + for (auto&& from : read_list) { + m_sim.register_read(from); + } + LOG_VERB("Done"); + } + + // Report memory residency + for (auto&& m : m_memory) { + LOG_VERB("Function " << m.first.first << "/out port " << m.first.second << " : maximum memory residency " + << m.second.size() << " tensor(s)"); + } + + LOG_VERB("Done"); +} + +void ov::npuw::FuncMemMgr::assign(const LinkFrom& from) { + // This method is the center of the function memory management. + // The logic is simple: + // - Look for an output tensor to reuse + // - If there's one, assign it to this allocation + // - If there's none, allocate a new tensor + // - How a tensor to reuse is picked: + // 1. It should exist + // 2. Its "remaining reads" count should be 0 (all planned reads + // happened at this point). 
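// Illustrative sketch (editorial aside, not part of this patch): the reuse policy
// described above, restated over simplified stand-in types - take the first tensor
// already allocated for this output slot whose planned reads have all happened,
// otherwise grow the pool. `Tensor`, `Link`, `Slot` and the callbacks are
// hypothetical simplifications of the NPUW types around them.
#include <cstddef>
#include <functional>
#include <memory>
#include <utility>
#include <vector>

using Tensor = std::shared_ptr<void>;               // stand-in for ov::SoPtr<ov::ITensor>
using Link = std::pair<std::size_t, std::size_t>;   // {submodel index, output port}

struct Slot {
    Tensor ptr;
    Link producer;
};

Tensor acquire(std::vector<Slot>& pool,
               const Link& from,
               const std::function<std::size_t(const Link&)>& reads_left,
               const std::function<Tensor()>& allocate) {
    for (auto& slot : pool) {
        if (reads_left(slot.producer) == 0u) {  // all planned reads of the old producer happened
            slot.producer = from;               // hand the tensor over to the new producer
            return slot.ptr;                    // reuse - no new allocation
        }
    }
    pool.push_back(Slot{allocate(), from});     // no free slot - allocate a fresh tensor
    return pool.back().ptr;
}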
+ // The tensor storage is organized like this: + // - Function: Here we use .replaced_by as a function identifier; taken from `from` + // - Output index: taken from `from` + // - A vector of resident tensors + + LOG_VERB("Assigning tensor for Subgraph[" << from.first << "]/" << from.second << "..."); + LOG_BLOCK(); + + const auto& comp_model_desc = m_model->m_compiled_submodels[from.first]; + NPUW_ASSERT(comp_model_desc.replaced_by.has_value()); + + const auto real_idx = comp_model_desc.replaced_by.value(); + + FO func_output = {real_idx, from.second}; + auto& assigned_memory = m_memory[func_output]; + auto asgn_iter = std::find_if(assigned_memory.begin(), assigned_memory.end(), [&](Assignment& a) { + return m_sim.remaining_reads(a.from) == 0u; + }); + if (asgn_iter != assigned_memory.end()) { + // Reassign this memory slot to the new "from" + asgn_iter->from = from; + m_table[from] = asgn_iter->ptr; + } else { + // No free space at this point - allocate a new tensor + const auto& proto_comp_model_desc = m_model->m_compiled_submodels[real_idx]; + const auto& proto_comp_model = proto_comp_model_desc.compiled_model; + + const auto& oport = proto_comp_model->outputs()[from.second]; + ov::Shape oshape = oport.get_shape(); + + if (proto_comp_model_desc.spatial) { + oshape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range; + } + const auto& device = m_model->funcall_mem_device(real_idx); + TensorPtr new_tensor = m_alloc(oport.get_element_type(), oshape, device); + NPUW_ASSERT(new_tensor); + + assigned_memory.push_back(Assignment{new_tensor, from}); + m_table[from] = new_tensor; + } + LOG_VERB("Done"); +} + +ov::npuw::TensorPtr ov::npuw::FuncMemMgr::get_tensor(const LinkFrom& from) { + return m_table.at(from); +} + ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr& compiled_model) - : IBaseInferRequest(compiled_model) { + : IBaseInferRequest(compiled_model), + m_func_mem_mgr(compiled_model) { + using namespace std::placeholders; + m_func_mem_mgr.set_alloc(std::bind(&JustInferRequest::allocMem, this, _1, _2, _3)); + m_func_mem_mgr.assign_memory(); + m_use_function_pipelining = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>(); if (m_use_function_pipelining) { LOG_WARN("Function call pipelining is enabled for " << m_npuw_model->m_name @@ -67,27 +232,20 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrparams) { const auto& iport = proto_comp_model_desc.compiled_model->inputs()[p.idx]; m_spatial_io[real_idx].input_tails[p.idx] = - allocTensor(iport, m_npuw_model->funcall_mem_device(real_idx)); + allocOut(iport, m_npuw_model->funcall_mem_device(real_idx)); } const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size(); for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) { const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx]; m_spatial_io[real_idx].output_tails[out_idx] = - allocTensor(oport, m_npuw_model->funcall_mem_device(real_idx)); + allocOut(oport, m_npuw_model->funcall_mem_device(real_idx)); } } } // if(spatial) for (size_t out_idx = 0; out_idx < num_outputs; out_idx++) { - const auto& port = proto_comp_model->outputs()[out_idx]; - ov::Shape shape = port.get_shape(); - - // If the subgraph is spatial, promote the output size to the full vector size - if (proto_comp_model_desc.spatial) { - shape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range; - } - m_funcall_result[LinkFrom{i, out_idx}] = - allocTensor(port.get_element_type(), shape, 
m_npuw_model->funcall_mem_device(real_idx)); + const auto from = LinkFrom{i, out_idx}; + m_funcall_result[from] = m_func_mem_mgr.get_tensor(from); } if (real_idx != i) { // If this function call is NOT the function body, do nothing here - the original @@ -152,7 +310,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrinputs().size(); i++) { const auto& port = m_npuw_model->inputs()[i]; - ov::SoPtr allocated = allocTensor(port, m_npuw_model->global_mem_device()); + ov::SoPtr allocated = allocOut(port, m_npuw_model->global_mem_device()); m_input_tensors.push_back(allocated); m_input_allocated.insert(allocated->data()); m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true}; @@ -174,7 +332,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrsecond // Function calls have their tensors allocated, so just use one - : allocTensor(port, m_npuw_model->global_mem_device()); + : allocOut(port, m_npuw_model->global_mem_device()); m_output_tensors.push_back(tensor); m_port_to_tensor[port] = TensorStorage{tensor, true}; @@ -920,27 +1078,22 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool } // if (replaced_by) } -ov::SoPtr ov::npuw::JustInferRequest::allocTensor(const ov::element::Type type, - const ov::Shape& shape, - const std::string& device) { +ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocMem(const ov::element::Type type, + const ov::Shape& shape, + const std::string& device) { if (device == "CPU" || ov::shape_size(shape) == 0) { return ov::get_tensor_impl(ov::Tensor(type, shape)); } - ov::SoPtr remote_tensor; - ov::Tensor allocated_tensor; - { - std::lock_guard guard(m_alloc_mutex); - m_remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr; - remote_tensor = m_remote_ctx->create_host_tensor(type, shape); - allocated_tensor = ov::make_tensor(remote_tensor); - } - return ov::get_tensor_impl(allocated_tensor); + std::lock_guard guard(m_alloc_mutex); + auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr; + auto remote_tensor = remote_ctx->create_host_tensor(type, shape); + return ov::get_tensor_impl(ov::make_tensor(remote_tensor)); } -ov::SoPtr ov::npuw::JustInferRequest::allocTensor(const ov::Output& node, - const std::string& device) { - return allocTensor(node.get_element_type(), node.get_shape(), device); +ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocOut(const ov::Output& node, + const std::string& device) { + return allocMem(node.get_element_type(), node.get_shape(), device); } void ov::npuw::JustInferRequest::subscribe_subrequest(std::size_t idx, Completed cb) { diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp index 7335b54c30062e..88838d8b39d75f 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp @@ -22,6 +22,56 @@ namespace npuw { class CompiledModel; class AsyncInferRequest; +using LinkFrom = std::pair; // FIXME: This is a third, if not fourth, definitiion of such structure + +using TensorPtr = ov::SoPtr; + +class MemAccessSim { +public: + explicit MemAccessSim(const std::shared_ptr& compiled_model); + + using ReadList = std::list; + const ReadList& read_list(std::size_t idx) const; + + std::size_t remaining_reads(const LinkFrom& from); + void register_read(const LinkFrom& from); + +private: + std::map m_remaining_reads; + 
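// Illustrative sketch (editorial aside, not part of this patch): the simulator is
// reference counting over producer links - every planned consumer of a submodel
// output (another submodel or a global model output) bumps a counter up-front,
// and each simulated execution decrements it; once a counter reaches zero the
// corresponding tensor can be recycled. Types below are simplified stand-ins.
#include <cstddef>
#include <list>
#include <map>
#include <utility>
#include <vector>

using Link = std::pair<std::size_t, std::size_t>;  // {submodel index, output port}

struct ReadSim {
    explicit ReadSim(std::size_t num_submodels) : read_list(num_submodels) {}

    std::map<Link, std::size_t> remaining_reads;   // planned-but-not-yet-done reads per source
    std::vector<std::list<Link>> read_list;        // per-submodel: which sources it reads

    void plan(std::size_t reader, const Link& source) {
        remaining_reads[source]++;                 // one more planned read of `source`
        read_list[reader].push_back(source);
    }
    void simulate(std::size_t reader) {            // "execute" submodel `reader`
        for (const auto& source : read_list[reader])
            remaining_reads[source]--;             // at zero, `source` memory becomes reusable
    }
};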
std::vector m_read_list; +}; + +class FuncMemMgr { + MemAccessSim m_sim; + std::shared_ptr m_model; + + void assign(const LinkFrom& from); + + // Function ID -> Output port number + using FO = std::pair; + struct Assignment { + TensorPtr ptr; + LinkFrom from; + }; + std::map> m_memory; // Dynamic assignment table + std::map m_table; // Static allocation/assignment table + +public: + explicit FuncMemMgr(const std::shared_ptr& compiled_model); + + using AllocFcn = std::function; + void set_alloc(AllocFcn&& fcn); + void assign_memory(); + + TensorPtr get_tensor(const LinkFrom& from); + +private: + AllocFcn m_alloc; +}; + class JustInferRequest final : public IBaseInferRequest { public: explicit JustInferRequest(const std::shared_ptr& compiled_model); @@ -64,15 +114,11 @@ class JustInferRequest final : public IBaseInferRequest { void connect_subrequests(); void recreate_subrequests(std::size_t idx); - ov::SoPtr allocTensor(const ov::element::Type type, const ov::Shape& shape, const std::string& device); - ov::SoPtr allocTensor(const ov::Output& node, const std::string& device); + TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device); + TensorPtr allocOut(const ov::Output& node, const std::string& device); - using LinkFrom = std::pair; // FIXME: This is a third, if not fourth, definitiion of such structure - using TensorPtr = ov::SoPtr; - std::map m_funcall_result; + FuncMemMgr m_func_mem_mgr; // Owns memory + std::map m_funcall_result; // Provides a convenient link bool is_pipelined(std::size_t idx) const; bool m_use_function_pipelining = false; @@ -103,8 +149,6 @@ class JustInferRequest final : public IBaseInferRequest { std::vector m_subrequests_gio; std::mutex m_alloc_mutex; - std::shared_ptr m_remote_ctx = nullptr; - std::unordered_set m_input_allocated; }; diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp index 46b6cb7b12681d..a66159e6b4d1b7 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp @@ -40,7 +40,6 @@ std::vector getIsolates(const std::string& isolates_unparsed); std::vector getNoFolds(::intel_npu::Config& cfg); std::vector getNoFolds(const std::string& nofolds_unparsed); // Set default predefined values for COMPUTE pipeline -void setComputeConfig(PassContext& ctx); void dump_partitioning(const ov::npuw::Ensemble& ens, const std::string& to); size_t getMinGraphSize(::intel_npu::Config& cfg) { @@ -204,12 +203,6 @@ std::vector getNoFolds(const std::string& nofolds_unparsed) { return nofolds; } -void setComputeConfig(PassContext& ctx) { - // FIXME: initialize via a dedicated function instead of parsing - ctx.isolates = detail::getIsolates(ISOL_PRESETS.at("COMPUTE")); - ctx.nofolds = detail::getNoFolds("compute"); -} - void dump_partitioning(const ov::npuw::Ensemble& ens, const std::string& to) { pugi::xml_document doc; @@ -277,10 +270,21 @@ class Compiler { NONE, // Partitioning will consist of a single group with all the Ops INIT, // Initialize only. 
The hardest mode, every group has just 1 layer inside JUST, // "justParitioning" - combination of LHF + Remnants - REP, // Repeated blocks pipeline - combination of repeatedBlocks and Remnants - default configuration + REP, // Repeated blocks pipeline - combination of repeatedBlocks and Remnants + REG, // Regularized repeated blocks pipeline -same as REP, but with some strong hints first COMPUTE // Separates non-foldable compute subgraphs from the model based on predefined rules + REP }; + template + void warn_unused() { + const auto& val = m_cfg.get(); + if (val != C::defaultValue()) { + LOG_WARN("User-specified configuration {" << C::key() << " : " << val + << "} is ignored in the current pipeline " + << m_cfg.get<::intel_npu::NPUW_ONLINE_PIPELINE>()); + } + } + Pipeline currentPipeline() { std::string pipeline_opt = m_cfg.getString<::intel_npu::NPUW_ONLINE_PIPELINE>(); if (pipeline_opt == "NONE") { @@ -291,6 +295,8 @@ class Compiler { return Pipeline::JUST; } else if (pipeline_opt == "REP") { return Pipeline::REP; + } else if (pipeline_opt == "REG") { + return Pipeline::REG; } else if (pipeline_opt == "COMPUTE") { return Pipeline::COMPUTE; } else { @@ -346,6 +352,23 @@ class Compiler { LOG_INFO("Done"); } + void reg() { + LOG_INFO("Online partitioning: compiling regularized repeated blocks pipeline..."); + LOG_BLOCK(); + + m_snapshot->earlyAvoids(); + m_snapshot->earlyRegroup(); + m_snapshot->repeatedBlocks([&]() { + // This callback is called when repeatingBlocks algorithm thinks it is done + m_snapshot->stripTag("compute"); + }); + m_snapshot->repeat([&] { + m_snapshot->fuseRemnantsExtended(); + }); + + LOG_INFO("Done"); + } + public: Compiler(const std::shared_ptr& model, ::intel_npu::Config& cfg) : m_model(model), @@ -384,9 +407,24 @@ class Compiler { case Pipeline::REP: rep(); break; + case Pipeline::REG: + warn_unused<::intel_npu::NPUW_ONLINE_ISOLATE>(); + + // Only get isolates here. 
+ // NB: We ignore NO_FOLD everywhere except pipeline COMPUTE - this needs + // to be aligned in the future + ctx.isolates = detail::getIsolates(detail::ISOL_PRESETS.at("COMPUTE")); + m_snapshot->setCtx(ctx); + reg(); + break; case Pipeline::COMPUTE: + warn_unused<::intel_npu::NPUW_ONLINE_ISOLATE>(); + warn_unused<::intel_npu::NPUW_ONLINE_NO_FOLD>(); + // Manually set predefined isolates and nofolds then do rep() pipeline - detail::setComputeConfig(ctx); + // FIXME: initialize via a dedicated function instead of parsing + ctx.isolates = detail::getIsolates(detail::ISOL_PRESETS.at("COMPUTE")); + ctx.nofolds = detail::getNoFolds("compute"); m_snapshot->setCtx(ctx); rep(); break; diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp index 991330663bbe48..cfa9e451ffb149 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp @@ -443,6 +443,10 @@ void Group::isolate(const std::string& tag) { m_isol_tag = tag; } +void Group::dontIsolate() { + m_isol_tag = ""; +} + const std::string& Group::isolatedTag() const { return m_isol_tag; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp index 69688248a0b9ac..538eeb03bc851c 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp @@ -77,6 +77,7 @@ class Group : public std::enable_shared_from_this { // FIXME: unify avoid and isolate void avoid(const std::string& device); void isolate(const std::string& tag); + void dontIsolate(); const std::set& avoidedTargets() const; const std::string& isolatedTag() const; std::string specialTags() const; diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp index 82856cece3de40..4cdc92ffc92d25 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp @@ -436,18 +436,27 @@ void Snapshot::earlyRegroup() { LOG_INFO("DONE."); } -void Snapshot::repeatedBlocks() { +void Snapshot::repeatedBlocks(Snapshot::CB&& on_done) { LOG_INFO("Online partitioning: executing repeatedBlocks pass group..."); LOG_BLOCK(); identifyUniques(); repeat([&] { repeat([&] { - mergeUniques(); + repeat([&] { + mergeUniques(); + }); + mergeTriangles(); + markInternalCompute(); + resetExcludedRep(); }); - mergeTriangles(); - markInternalCompute(); - resetExcludedRep(); + // While the current process is entirely done, let the caller + // influence the partitioning - so the algorithm could continue. + if (on_done) { + on_done(); + } else { + return; // FROM top-level repeat! 
+ } }); cleanUpUniques(); @@ -1086,3 +1095,12 @@ void Snapshot::repeat(detail::Pass&& pass) { void Snapshot::setCtx(const ov::npuw::online::PassContext& ctx) { m_ctx = ctx; } + +void Snapshot::stripTag(const std::string& tag) { + for (auto&& nh : m_graph->nodes()) { + auto gptr = m_graph->meta(nh).get(); + if (gptr->isolatedTag() == tag) { + gptr->dontIsolate(); + } + } +} diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp index e7e5121b1240e7..6da1a6d98939bb 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp @@ -46,10 +46,13 @@ class Snapshot : public std::enable_shared_from_this { void fuseInputs(); // Advanced passes for repeated blocks algorithm - void repeatedBlocks(); + using CB = std::function; + void repeatedBlocks(CB&& on_done = {}); void earlyAvoids(); void earlyRegroup(); + void stripTag(const std::string& tag); + // Utility std::shared_ptr getGraph() const; const detail::OVPortsMap& getPortsMap() const; diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp index c9a162421fe243..077fb6d6660132 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp @@ -335,7 +335,7 @@ DQMatMulGQ2i::DQMatMulGQ2i(Context::Ref ctx) { auto qcvtw = opp::wrap_type({qweight}); auto qmuls = opp::wrap_type({qcvtw, qcoeff}); auto qreshp = opp::wrap_type({qmuls, opp::any_input()}); - auto qcvtr = opp::wrap_type({qreshp}); + auto qcvtr = opp::optional({qreshp->output(0)}); auto qmmi = opp::any_input(); auto qmm = opp::wrap_type({qmmi, qcvtr}); @@ -409,13 +409,18 @@ DQMatMulGQ2i::DQMatMulGQ2i(Context::Ref ctx) { auto rshp_ccat = std::make_shared(scaled, rshp_ccat_c, false); auto reduce_axis = std::make_shared(ov::element::i32, ov::Shape{}, 1); - auto reduce = std::make_shared(rshp_ccat, reduce_axis, true); + // Make reduceSum not to keep axis because then it will convert to poolings in compiler. + // Otherwise reduceSum will convert to the convolution which is less efficient than poolings. + auto reduce = std::make_shared(rshp_ccat, reduce_axis, false); auto rshp_out_c = std::make_shared(ov::element::i32, ov::Shape{3}, out_shape); auto rshp_out = std::make_shared(reduce, rshp_out_c, false); - // Convert the result to f32 to maintain the graph contracts. FIXME should be avoided - auto out = std::make_shared(rshp_out, ov::element::f32); + // Convert the result to f32 to maintain the graph contracts if required. + std::shared_ptr out = rshp_out; + if (matched_matmul->get_element_type() == ov::element::f32) { + out = std::make_shared(rshp_out, ov::element::f32); + } // Now.. 
Reconnect the matmul readers to the new output (reducesum) for (auto&& r : matched_matmul->output(0).get_target_inputs()) { @@ -690,11 +695,6 @@ DQParMMGQ::DQParMMGQ(Context::Ref ctx) { return false; } - if (qmmi_shape[1] != 1 && !ctx.get().is_spatial) { - // For non 1-token cases, do transformation if and only if and only if the block is spatial - return false; - } - if (!matmul->get_transpose_a() && !matmul->get_transpose_b()) { ctx.get().register_parallel_matmul(node_to_output.at(qmmi), 2, Context::DQParMM{w_param, s_param, matmul}); } else if (!matmul->get_transpose_a() && matmul->get_transpose_b()) { @@ -752,7 +752,7 @@ void mergeParallelMatMuls(const std::shared_ptr& m, Context& ctx) { auto new_cvt = std::make_shared(new_w, new_s->get_element_type()); std::shared_ptr new_mul = std::make_shared(new_cvt, new_s); - if (new_s->get_element_type() == ov::element::f16) { + if ((new_s->get_element_type() == ov::element::f16) && (orig_multiply.get_element_type() == ov::element::f32)) { new_mul = std::make_shared(new_mul, ov::element::f32); } auto new_w_shape = new_w->get_shape(); diff --git a/src/plugins/intel_npu/tests/functional/behavior/npuw/behavior_tests.cpp b/src/plugins/intel_npu/tests/functional/behavior/npuw/behavior_tests.cpp index 093e3235afb78f..b55d39bead49bb 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/npuw/behavior_tests.cpp +++ b/src/plugins/intel_npu/tests/functional/behavior/npuw/behavior_tests.cpp @@ -489,30 +489,24 @@ TEST_F(BehaviorTestsNPUWOnlinePartitioning, FoldingAndPipelining) { EXPECT_COMPILE_MODEL(mock_cpu, TIMES(0)); } - for (int i = 0; i < 3; i++) { - // Here we will create 2 infer requests per model, - // so `create_sync_infer_request()` should be called twice - // per model: - EXPECT_CREATE_SYNC_INFER_REQ(mock_npu, MODEL(i), TIMES(2)); - } - - // 1st model 1st infer request is called once -- head - EXPECT_INFER_FOR(mock_npu, MODEL(0), INFER_REQ(0), TIMES(1)); - // 1st model 2nd infer request is never called, - // it is not a function and is not repeated - EXPECT_INFER_FOR(mock_npu, MODEL(0), INFER_REQ(1), TIMES(0)); + // 1 infer request for head: + EXPECT_CREATE_SYNC_INFER_REQ(mock_npu, MODEL(0), TIMES(1)); + // 2 infer requests for function, `create_sync_infer_request()` + // should be called twice here: + EXPECT_CREATE_SYNC_INFER_REQ(mock_npu, MODEL(1), TIMES(2)); + // 1 infer request for tail: + EXPECT_CREATE_SYNC_INFER_REQ(mock_npu, MODEL(2), TIMES(1)); + + // Head's infer request is called once: + EXPECT_INFER(mock_npu, MODEL(0), TIMES(1)); - // Repeated block - // 2nd model 1st infer request is called 5 times + // Repeated block's model 1st infer request is called 5 times: EXPECT_INFER_FOR(mock_npu, MODEL(1), INFER_REQ(0), TIMES(5)); - // 2nd model 2nd infer request (brother of 1st one) is called 5 times + // Repeated block's model 2nd infer request (brother of 1st one) is called 5 times: EXPECT_INFER_FOR(mock_npu, MODEL(1), INFER_REQ(1), TIMES(5)); - // 3rd model 1st infer request is called once -- tail - EXPECT_INFER_FOR(mock_npu, MODEL(2), INFER_REQ(0), TIMES(1)); - // 3rd model 2nd infer request is never called, - // it is not a function and is not repeated - EXPECT_INFER_FOR(mock_npu, MODEL(2), INFER_REQ(1), TIMES(0)); + // Tail's infer request is called once: + EXPECT_INFER(mock_npu, MODEL(2), TIMES(1)); // Register mock objects as plugins in OpenVINO: register_mock_plugins_in_ov(); diff --git a/src/plugins/intel_npu/tests/functional/behavior/npuw/mocks/mock_plugins.cpp 
b/src/plugins/intel_npu/tests/functional/behavior/npuw/mocks/mock_plugins.cpp index ed4bf72a945f79..950d80b279324f 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/npuw/mocks/mock_plugins.cpp +++ b/src/plugins/intel_npu/tests/functional/behavior/npuw/mocks/mock_plugins.cpp @@ -243,9 +243,10 @@ void MockPluginBase::create_implementation() { .WillByDefault([](const ov::AnyMap& remote_properties) -> ov::SoPtr { OPENVINO_NOT_IMPLEMENTED; }); + // This method is utilized for remote tensor allocation in NPUW JustInferRequest and Weight bank. ON_CALL(*this, get_default_context) .WillByDefault([](const ov::AnyMap& remote_properties) -> ov::SoPtr { - OPENVINO_NOT_IMPLEMENTED; + return std::make_shared(device_name); }); ON_CALL(*this, import_model(testing::_, testing::_)) .WillByDefault([](std::istream& model, const ov::AnyMap& properties) diff --git a/src/plugins/intel_npu/tests/functional/behavior/npuw/mocks/mock_plugins.hpp b/src/plugins/intel_npu/tests/functional/behavior/npuw/mocks/mock_plugins.hpp index 4d720796c6abbf..e8f9e134fcb324 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/npuw/mocks/mock_plugins.hpp +++ b/src/plugins/intel_npu/tests/functional/behavior/npuw/mocks/mock_plugins.hpp @@ -23,6 +23,23 @@ namespace ov { namespace npuw { namespace tests { +// Need for remote tensor allocation in NPUW JustInferRequest and Weight bank. +// They utilize "create_host_tensor()" method. +// TODO: Mock "create_host_tensor()" method and add tests for it. +class MockRemoteContext : public ov::IRemoteContext { + std::string m_name; + +public: + MockRemoteContext(std::string name) : m_name(std::move(name)) {} + const std::string& get_device_name() const override { + return m_name; + } + MOCK_METHOD(ov::SoPtr, + create_tensor, + (const ov::element::Type&, const ov::Shape&, const ov::AnyMap&)); + MOCK_METHOD(const ov::AnyMap&, get_property, (), (const)); +}; + class MockCompiledModelBase; using MockCompiledModel = testing::NiceMock; diff --git a/src/plugins/template/tests/functional/op_reference/proposal.cpp b/src/plugins/template/tests/functional/op_reference/proposal.cpp index aa49a6b7330166..435a279588af07 100644 --- a/src/plugins/template/tests/functional/op_reference/proposal.cpp +++ b/src/plugins/template/tests/functional/op_reference/proposal.cpp @@ -88,10 +88,6 @@ struct ProposalV4Params { const int feature_stride, const int pre_nms_topn, const int post_nms_topn, - const size_t image_shape_num, - const size_t image_h, - const size_t image_w, - const size_t image_z, const std::vector& ratios, const std::vector& scales, const size_t batch_size, @@ -101,19 +97,22 @@ struct ProposalV4Params { const ov::element::Type& iType, const std::vector& clsScoreValues, const std::vector& bboxPredValues, + const std::vector& inputInfoValues, const std::vector& proposalValues, const std::vector& probsValues, + const std::string& framework, const std::string& test_name = "") : inType(iType), outType(iType), clsScoreData(CreateTensor(iType, clsScoreValues)), bboxPredData(CreateTensor(iType, bboxPredValues)), + imageInfoData(CreateTensor(iType, inputInfoValues)), refProposalData(CreateTensor(Shape{batch_size * post_nms_topn, 5}, iType, proposalValues)), refProbsData(CreateTensor(Shape{batch_size * post_nms_topn}, iType, probsValues)), testcaseName(test_name) { clsScoreShape = Shape{batch_size, anchor_num * 2, feat_map_height, feat_map_width}; bboxPredShape = Shape{batch_size, anchor_num * 4, feat_map_height, feat_map_width}; - imageShapeShape = Shape{image_shape_num}; + imageInfoShape = 
Shape{inputInfoValues.size()}; attrs.base_size = min_bbox_size; attrs.min_size = min_bbox_size; @@ -129,25 +128,19 @@ struct ProposalV4Params { attrs.normalize = false; attrs.box_size_scale = 1.0f; attrs.box_coordinate_scale = 1.0f; - attrs.framework = ""; + attrs.framework = framework; attrs.infer_probs = true; - - std::vector inputShapeValues; - inputShapeValues.push_back(static_cast(image_h)); - inputShapeValues.push_back(static_cast(image_w)); - inputShapeValues.push_back(static_cast(image_z)); - imageShapeData = CreateTensor(iType, inputShapeValues); } ov::op::v4::Proposal::Attributes attrs; ov::PartialShape clsScoreShape; ov::PartialShape bboxPredShape; - ov::PartialShape imageShapeShape; + ov::PartialShape imageInfoShape; ov::element::Type inType; ov::element::Type outType; ov::Tensor clsScoreData; ov::Tensor bboxPredData; - ov::Tensor imageShapeData; + ov::Tensor imageInfoData; ov::Tensor refProposalData; ov::Tensor refProbsData; std::string testcaseName; @@ -192,7 +185,7 @@ class ReferenceProposalV4LayerTest : public testing::TestWithParam& obj) { @@ -200,9 +193,11 @@ class ReferenceProposalV4LayerTest : public testing::TestWithParam CreateFunction(const ProposalV4Params& params) { const auto class_probs_param = std::make_shared(params.inType, params.clsScoreShape); const auto bbox_deltas_param = std::make_shared(params.inType, params.bboxPredShape); - const auto image_shape_param = std::make_shared(params.inType, params.imageShapeShape); + const auto image_shape_param = std::make_shared(params.inType, params.imageInfoShape); const auto Proposal = std::make_shared(class_probs_param, bbox_deltas_param, image_shape_param, params.attrs); return std::make_shared(Proposal->outputs(), @@ -235,21 +230,21 @@ std::vector generateProposalV1Params() { std::vector proposalV1Params{ ProposalV1Params( - 0.7f, - 16, - 16, - 6000, - 10, // iou_threshold, min_nnox_size, feature_stride,pre_nms_topn, post_nms_topn - 3, - 210, - 350, - 1, // image_shape_num, image_h, image_w, image_z + 0.7f, // iou_threshold + 16, // min_nnox_size + 16, // feature_stride + 6000, // pre_nms_topn + 10, // post_nms_topn + 3, // image_shape_num + 210, // image_h + 350, // image_w + 1, // image_z {0.5f}, // ratios {32.0f}, // scales - 1, - 1, - 10, - 10, // batch_size, anchor_num, feat_map_height, feat_map_width + 1, // batch_size + 1, // anchor_num + 10, // feat_map_height + 10, // feat_map_width IN_ET, std::vector{ 0.000240f, 0.003802f, 0.111432f, 0.000503f, 0.007887f, 0.144701f, 0.399074f, 0.004680f, // 0 @@ -351,22 +346,18 @@ std::vector generateProposalV4Params() { using T = typename element_type_traits::value_type; std::vector proposalV4Params{ - ProposalV4Params( - 0.7f, - 16, - 16, - 6000, - 10, // iou_threshold, min_nnox_size, feature_stride,pre_nms_topn, post_nms_topn - 3, - 210, - 350, - 1, // image_shape_num, image_h, image_w, image_z + ProposalV4Params{ + 0.7f, // iou_threshold + 16, // min_bbox_size + 16, // feature_stride + 6000, // pre_nms_topn + 10, // post_nms_topn {0.5f}, // ratios {32.0f}, // scales - 1, - 1, - 10, - 10, // batch_size, anchor_num, feat_map_height, feat_map_width + 1, // batch_size + 1, // anchor_num + 10, // feat_map_height + 10, // feat_map_width IN_ET, std::vector{ 0.000240f, 0.003802f, 0.111432f, 0.000503f, 0.007887f, 0.144701f, 0.399074f, 0.004680f, // 0 @@ -447,6 +438,7 @@ std::vector generateProposalV4Params() { 0.026623f, 0.117951f, -0.076234f, -0.811997f, 0.01301f, 0.020042f, 0.173756f, -0.036191f, -0.068887f, 0.0229f, 0.245465f, 0.214282f, -0.011054f, 0.132813f, 0.241014f, 
-0.148763f, }, + std::vector{210, 350, 1}, std::vector{ 0.000000f, 0.000000f, 0.000000f, 349.000000f, 209.000000f, // 0 0.000000f, 0.000000f, 0.000000f, 237.625443f, 209.000000f, // 5 @@ -470,36 +462,135 @@ std::vector generateProposalV4Params() { 0.0008570f, 0.0002190f, 0.0000000f, - }), + }, + ""}, + ProposalV4Params{ + 0.7f, // iou_threshold + 16, // min_bbox_size + 16, // feature_stride + 6000, // pre_nms_topn + 10, // post_nms_topn + {0.5f}, // ratios + {32.0f}, // scales + 1, // batch_size + 1, // anchor_num + 10, // feat_map_height + 10, // feat_map_width + IN_ET, + std::vector{ + 0.000240f, 0.003802f, 0.111432f, 0.000503f, 0.007887f, 0.144701f, 0.399074f, 0.004680f, // 0 + 0.139741f, 0.002386f, 0.030003f, 0.276552f, 0.000267f, 0.022971f, 0.287953f, 0.050235f, // 8 + 0.002580f, 0.206311f, 0.000146f, 0.009656f, 0.175462f, 0.000147f, 0.014718f, 0.272348f, // 16 + 0.065199f, 0.003286f, 0.185335f, 0.003720f, 0.025932f, 0.251401f, 0.001465f, 0.090447f, // 24 + 0.488469f, 0.092259f, 0.019306f, 0.379091f, 0.005311f, 0.010369f, 0.087615f, 0.042003f, // 32 + 0.073871f, 0.416763f, 0.044282f, 0.069776f, 0.313032f, 0.000457f, 0.017346f, 0.089762f, // 40 + 0.000820f, 0.103986f, 0.367993f, 0.026315f, 0.035701f, 0.299252f, 0.000135f, 0.017825f, // 48 + 0.150119f, 0.000076f, 0.050511f, 0.269601f, 0.026680f, 0.003541f, 0.189765f, 0.000051f, // 56 + 0.004315f, 0.193150f, 0.000032f, 0.007254f, 0.185557f, 0.051526f, 0.000657f, 0.117579f, // 64 + 0.000115f, 0.010179f, 0.293187f, 0.000025f, 0.006505f, 0.175345f, 0.032587f, 0.000469f, // 72 + 0.098443f, 0.000121f, 0.009600f, 0.322782f, 0.000032f, 0.004543f, 0.166860f, 0.044911f, // 80 + 0.000187f, 0.102691f, 0.000242f, 0.005502f, 0.107865f, 0.000191f, 0.005336f, 0.086893f, // 88 + 0.078422f, 0.000345f, 0.079096f, 0.000281f, 0.016388f, 0.214072f, 0.000107f, 0.012027f, // 96 + 0.192754f, 0.049531f, 0.000386f, 0.149893f, 0.000374f, 0.016965f, 0.204781f, 0.000163f, // 104 + 0.016272f, 0.215277f, 0.032298f, 0.000857f, 0.133426f, 0.000614f, 0.020215f, 0.165789f, // 112 + 0.000225f, 0.036951f, 0.262195f, 0.087675f, 0.004596f, 0.147764f, 0.000219f, 0.010502f, // 120 + 0.163394f, 0.000152f, 0.023116f, 0.241702f, 0.081800f, 0.002197f, 0.146637f, 0.000193f, // 128 + 0.012017f, 0.133497f, 0.000375f, 0.028605f, 0.309179f, 0.065962f, 0.005508f, 0.155530f, // 136 + 0.000186f, 0.004540f, 0.079319f, 0.000799f, 0.031003f, 0.303045f, 0.051473f, 0.017770f, // 144 + 0.206188f, 0.000202f, 0.004291f, 0.061095f, 0.001109f, 0.018094f, 0.156639f, 0.026062f, // 152 + 0.005270f, 0.148651f, 0.000026f, 0.007300f, 0.096013f, 0.000383f, 0.022134f, 0.129511f, // 160 + 0.080882f, 0.003416f, 0.129922f, 0.000037f, 0.010040f, 0.130007f, 0.000116f, 0.014904f, // 168 + 0.171423f, 0.082893f, 0.000921f, 0.154976f, 0.000142f, 0.016552f, 0.209696f, 0.000227f, // 176 + 0.022418f, 0.228501f, 0.111712f, 0.001987f, 0.158164f, 0.001200f, 0.027049f, 0.308222f, // 184 + 0.001366f, 0.038146f, 0.287945f, 0.072526f, 0.016064f, 0.257895f, 0.000595f, 0.016962f, // 192 + }, + std::vector{ + 0.006756f, -0.055635f, 0.030843f, 0.007482f, 0.009056f, -0.041824f, 0.119722f, 0.168988f, + 0.002822f, 0.039733f, 0.109005f, 0.245152f, -0.013196f, -0.018222f, -0.170122f, -0.374904f, + -0.005455f, -0.034059f, -0.006787f, 0.072005f, -0.017933f, -0.007358f, 0.034149f, 0.123846f, + 0.128319f, 0.016107f, -0.615487f, -1.235094f, -0.024253f, -0.019406f, 0.134142f, 0.157853f, + -0.021119f, 0.007383f, 0.089365f, 0.092854f, 0.062491f, 0.002366f, 0.122464f, -0.003326f, + 0.015468f, -0.034088f, 0.079009f, 0.075483f, 0.011972f, 
0.042427f, 0.106865f, 0.158754f, + 0.071211f, -0.034009f, 0.007985f, -0.441477f, 0.009046f, -0.028515f, 0.095372f, 0.119598f, + -0.007553f, -0.0072f, 0.105072f, 0.084314f, 0.23268f, -0.02906f, -0.408454f, -1.13439f, + 0.016202f, -0.037859f, 0.130873f, 0.129652f, 0.002064f, -0.011969f, 0.171623f, 0.050218f, + 0.113831f, 0.028922f, 0.017785f, 0.059708f, 0.037658f, -0.011245f, 0.097197f, 0.137491f, + 0.024218f, 0.04739f, 0.091978f, 0.217333f, 0.088418f, -0.004662f, -0.095168f, -0.397928f, + 0.02639f, -0.008501f, 0.068487f, 0.108465f, 0.020069f, 0.018829f, 0.040206f, 0.068473f, + 0.226458f, -0.072871f, -0.672384f, -1.447558f, 0.039598f, 0.017471f, 0.187288f, 0.08409f, + 0.017152f, -0.00516f, 0.183419f, 0.068469f, 0.063944f, 0.160725f, -0.022493f, -0.132291f, + 0.010542f, 0.036318f, 0.074042f, -0.013323f, 0.00808f, 0.060365f, 0.120566f, 0.21866f, + 0.046324f, 0.088741f, 0.029469f, -0.517183f, 0.00917f, 0.011915f, 0.053674f, 0.140168f, + 0.0033f, 0.022759f, -0.006196f, 0.063839f, 0.083726f, -0.088385f, -0.57208f, -1.454211f, + 0.020655f, 0.010788f, 0.134951f, 0.109709f, 0.015445f, -0.015363f, 0.109153f, 0.051209f, + 0.024297f, 0.139126f, -0.12358f, -0.127979f, 0.004587f, 0.004751f, 0.047292f, 0.027066f, + 0.011003f, 0.069887f, 0.117052f, 0.267419f, 0.039306f, 0.077584f, 0.02579f, -0.496149f, + -0.005569f, 0.015494f, -0.011662f, 0.105549f, -0.007015f, 0.031984f, -0.075742f, 0.0852f, + 0.023886f, -0.053107f, -0.325533f, -1.329066f, 0.004688f, 0.034501f, 0.089317f, 0.042463f, + 0.004212f, -0.015128f, 0.00892f, 0.028266f, 0.009997f, 0.157822f, 0.020116f, -0.142337f, + 0.008199f, 0.046564f, 0.083014f, 0.046307f, 0.006771f, 0.084997f, 0.141935f, 0.228339f, + -0.020308f, 0.077745f, -0.018319f, -0.522311f, 0.010432f, 0.024641f, 0.020571f, 0.097148f, + 0.002064f, 0.035053f, -0.121995f, 0.012222f, -0.030779f, 0.100481f, -0.331737f, -1.257669f, + -0.013079f, 0.021227f, 0.159949f, 0.120097f, 0.005765f, -0.012335f, -0.005268f, 0.042067f, + -0.043972f, 0.102556f, 0.180494f, -0.084721f, -0.011962f, 0.031302f, 0.112511f, 0.027557f, + -0.002085f, 0.082978f, 0.149409f, 0.195091f, -0.033731f, 0.019861f, -0.064047f, -0.471328f, + -0.004093f, 0.016803f, 0.044635f, 0.058912f, -0.018735f, 0.035536f, -0.050373f, -0.002794f, + -0.086705f, 0.038435f, -0.301466f, -1.071246f, -0.028247f, 0.018984f, 0.254702f, 0.141142f, + -0.017522f, 0.014843f, 0.079391f, 0.079662f, -0.051204f, 0.048419f, 0.235604f, -0.185797f, + -0.019569f, 0.02678f, 0.162507f, 0.046435f, -0.004606f, 0.08806f, 0.18634f, 0.193957f, + -0.024333f, -0.01298f, -0.17977f, -0.65881f, -0.003778f, 0.007418f, 0.065439f, 0.104549f, + -0.027706f, 0.03301f, 0.057492f, 0.032019f, -0.135337f, 0.000269f, -0.250203f, -1.181688f, + -0.027022f, -0.006755f, 0.206848f, 0.129268f, -0.003529f, 0.013445f, 0.181484f, 0.139955f, + -0.036587f, 0.065824f, 0.288751f, -0.110813f, -0.015578f, 0.044818f, 0.17756f, 0.006914f, + 0.002329f, 0.068982f, 0.189079f, 0.184253f, 0.00301f, -0.039168f, -0.010855f, -0.393254f, + 0.000028f, 0.001906f, 0.07217f, 0.063305f, -0.026144f, 0.028842f, 0.139149f, 0.023377f, + 0.023362f, 0.023559f, -0.145386f, -0.863572f, -0.015749f, -0.021364f, 0.172571f, 0.078393f, + -0.037253f, 0.014978f, 0.221502f, 0.189111f, -0.048956f, 0.085409f, 0.325399f, -0.058294f, + -0.028495f, 0.021663f, 0.19392f, 0.02706f, 0.006908f, 0.065751f, 0.176395f, 0.138375f, + 0.012418f, -0.031228f, -0.008762f, -0.427345f, -0.013677f, -0.002429f, 0.069655f, 0.019505f, + -0.036763f, 0.022528f, 0.201062f, 0.022205f, 0.024528f, 0.06241f, -0.076237f, -0.840695f, + -0.007268f, -0.027865f, 
0.211056f, 0.074744f, -0.053563f, 0.006863f, 0.301432f, 0.192879f, + -0.021944f, 0.100535f, 0.19031f, -0.133746f, -0.006151f, 0.023944f, 0.13561f, -0.03259f, + 0.000618f, 0.063736f, 0.180904f, 0.12393f, 0.001275f, -0.0306f, -0.032822f, -0.496515f, + 0.009757f, 0.014602f, 0.004532f, -0.039969f, -0.015984f, 0.047726f, 0.099865f, 0.003163f, + 0.026623f, 0.117951f, -0.076234f, -0.811997f, 0.01301f, 0.020042f, 0.173756f, -0.036191f, + -0.068887f, 0.0229f, 0.245465f, 0.214282f, -0.011054f, 0.132813f, 0.241014f, -0.148763f, + }, + std::vector{210, 350, 1, 1}, + std::vector{0.f, 11.9688f, 4.02532f, 204.528f, 182.586f, 0.f, 33.7915f, 48.4886f, 210.f, + 238.505f, 0.f, 0.f, 0.f, 204.428f, 337.029f, 0.f, 72.611f, 9.87545f, + 203.687f, 212.299f, 0.f, 5.08432f, 4.19913f, 208.719f, 249.225f, 0.f, 23.6503f, + 57.8165f, 210.f, 350.f, 0.f, 84.8804f, 9.47241f, 156.822f, 243.003f, 0.f, + 101.663f, 15.5542f, 166.083f, 327.839f, 0.f, 13.9738f, 0.f, 210.f, 128.482f, + 0.f, 77.8929f, 29.663f, 186.561f, 313.287f + + }, + std::vector< + T>{0.309179, 0.308222, 0.303045, 0.241702, 0.192754, 0.165789, 0.15553, 0.154976, 0.146637, 0.129511}, + "tensorflow"}, }; return proposalV4Params; } std::vector generateProposalV1CombinedParams() { - const std::vector> proposalTypeParams{ - generateProposalV1Params(), - generateProposalV1Params(), - generateProposalV1Params(), - generateProposalV1Params()}; + std::vector> proposalTypeParams{generateProposalV1Params(), + generateProposalV1Params(), + generateProposalV1Params(), + generateProposalV1Params()}; std::vector combinedParams; - - for (const auto& params : proposalTypeParams) { - combinedParams.insert(combinedParams.end(), params.begin(), params.end()); - } + for (auto& params : proposalTypeParams) + std::move(params.begin(), params.end(), std::back_inserter(combinedParams)); return combinedParams; } std::vector generateProposalV4CombinedParams() { - const std::vector> proposalTypeParams{ - generateProposalV4Params(), - generateProposalV4Params(), - generateProposalV4Params(), - generateProposalV4Params()}; + std::vector> proposalTypeParams{generateProposalV4Params(), + generateProposalV4Params(), + generateProposalV4Params(), + generateProposalV4Params()}; std::vector combinedParams; - - for (const auto& params : proposalTypeParams) { - combinedParams.insert(combinedParams.end(), params.begin(), params.end()); - } + for (auto& params : proposalTypeParams) + std::move(params.begin(), params.end(), std::back_inserter(combinedParams)); return combinedParams; } diff --git a/src/tests/functional/plugin/shared/include/subgraph_tests/lora_pattern.hpp b/src/tests/functional/plugin/shared/include/subgraph_tests/lora_pattern.hpp new file mode 100644 index 00000000000000..8f9687b7b93b2a --- /dev/null +++ b/src/tests/functional/plugin/shared/include/subgraph_tests/lora_pattern.hpp @@ -0,0 +1,23 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/subgraph/lora_pattern.hpp" + +namespace ov { +namespace test { + +TEST_P(LoraPatternMatmul, empty_tensors) { + targetStaticShapes = {{{{1, 20, K}}, {{N, K}}}}; + run_test_empty_tensors(); +} + +TEST_P(LoraPatternConvolution, empty_tensors) { + targetStaticShapes = {{{1, num_channels, 64, 64}}}; + run_test_empty_tensors(); +} + +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/subgraph_tests/rotary_pos_emb.hpp b/src/tests/functional/plugin/shared/include/subgraph_tests/rotary_pos_emb.hpp 
index f2b19a6748a6a7..7100ddca1083e3 100644 --- a/src/tests/functional/plugin/shared/include/subgraph_tests/rotary_pos_emb.hpp +++ b/src/tests/functional/plugin/shared/include/subgraph_tests/rotary_pos_emb.hpp @@ -87,5 +87,12 @@ TEST_P(RoPETestGPTJSlice, CompareWithRefs) { CheckNumberOfNodesWithType(function, {"RoPE"}, 1); }; +TEST_P(RoPETestChatGLM2DRoPEStridedSlice, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED(); + run(); + auto function = compiledModel.get_runtime_model(); + CheckNumberOfNodesWithType(function, {"RoPE"}, 1); +}; + } // namespace test } // namespace ov diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/lora_pattern.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/lora_pattern.hpp new file mode 100644 index 00000000000000..16764d37dcf688 --- /dev/null +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/lora_pattern.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/ov_subgraph.hpp" + +namespace ov { +namespace test { + +class LoraPatternBase : public SubgraphBaseTest { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj); + +protected: + void run_test_empty_tensors(); + +protected: + static constexpr auto t4_name = "lora/MatMul.B"; + static constexpr auto t5_name = "lora/MatMul.alpha"; + static constexpr auto t6_name = "lora/MatMul.A"; + static constexpr auto netType = ov::element::f32; +}; + +class LoraPatternMatmul : public LoraPatternBase, public testing::WithParamInterface { +public: + void SetUp() override; + +protected: + static constexpr size_t K = 563ul; // Weights matrix K dimension + static constexpr size_t N = 2048ul; // Weights matrix N dimension +}; + +class LoraPatternConvolution : public LoraPatternBase, public testing::WithParamInterface { +public: + void SetUp() override; + +protected: + static constexpr size_t num_channels = 320ul; +}; + +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/rotary_pos_emb.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/rotary_pos_emb.hpp index 2663a6f5ad3fab..e1182bd3b16e13 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/rotary_pos_emb.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/rotary_pos_emb.hpp @@ -115,5 +115,17 @@ class RoPETestGPTJSlice : public RoPETestGPTJStridedSlice { void SetUp() override; }; +class RoPETestChatGLM2DRoPEStridedSlice : public SubgraphBaseTest, public testing::WithParamInterface { +private: + std::shared_ptr buildROPE_ChatGLM(int batch, int head_cnt, int rotary_dims); +protected: + ov::Tensor create_i32_tensor(const ov::Shape& shape, int start, int step = 1); + void generate_inputs(const std::vector& targetInputStaticShapes) override; + void SetUp() override; + +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj); +}; + } // namespace test } // namespace ov diff --git a/src/tests/functional/shared_test_classes/src/subgraph/lora_pattern.cpp b/src/tests/functional/shared_test_classes/src/subgraph/lora_pattern.cpp new file mode 100644 index 00000000000000..6f74fd09b022a6 --- /dev/null +++ b/src/tests/functional/shared_test_classes/src/subgraph/lora_pattern.cpp @@ -0,0 +1,143 @@ +// Copyright (C) 
2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "shared_test_classes/subgraph/lora_pattern.hpp" + +#include "common_test_utils/node_builders/eltwise.hpp" +#include "common_test_utils/node_builders/convolution.hpp" +#include "common_test_utils/ov_tensor_utils.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" + +namespace ov { +namespace test { + + +std::string LoraPatternBase::getTestCaseName(const testing::TestParamInfo& obj) { + auto device_name = obj.param; + return std::string{"targetDevice="} + device_name; //NOLINT +} + +constexpr ov::element::Type LoraPatternBase::netType; //redundant variable definition for C++ prior to C++17 + +void LoraPatternBase::run_test_empty_tensors() { + compile_model(); + inferRequest = compiledModel.create_infer_request(); + ASSERT_TRUE(inferRequest); + generate_inputs(targetStaticShapes.front()); + for (const auto& input : inputs) { + inferRequest.set_tensor(input.first, input.second); + } + + inferRequest.infer(); + auto outputs = function->outputs(); + + auto tx_result = inferRequest.get_tensor(outputs[0]); + auto tz_result = inferRequest.get_tensor(outputs[1]); + ov::test::utils::compare(tx_result, tz_result, 1e-4, 1e-4); +} + +void LoraPatternMatmul::SetUp() { + targetDevice = this->GetParam(); + + ov::PartialShape shape_x = {-1, -1, K}; + ov::PartialShape shape_w = {N, K}; + + auto param_y = std::make_shared(netType, shape_x); + auto param_w = std::make_shared(netType, shape_w); + + // "Main" matrix multiplication from the original transformer model + auto tx = std::make_shared(param_y, param_w, false, true); + + // LoRA parameters from states + auto variable_t4 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({N, -1}), netType, t4_name}); + auto t4 = std::make_shared(variable_t4); + auto t4_assign = std::make_shared(t4, variable_t4); + + auto variable_t5 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), netType, t5_name}); + auto t5 = std::make_shared(variable_t5); + auto t5_assign = std::make_shared(t5, variable_t5); + + auto variable_t6 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, K}), netType, t6_name}); + auto t6 = std::make_shared(variable_t6); + auto t6_assign = std::make_shared(t6, variable_t6); + + // Apply LoRA parameters to the current activations + auto t5810 = std::make_shared(param_y, t6, false, true); + auto t5811 = std::make_shared(t5810, t5); + auto t5812 = std::make_shared(t5811, t4, false, true); + + // Mix LoRA part into normally computed activations after the "main" MatMul + auto tz = std::make_shared(tx, t5812); + + auto result_x = std::make_shared(tx); + auto result_z = std::make_shared(tz); + + function = std::make_shared(ov::ResultVector({result_x, result_z}), + ov::SinkVector({t4_assign, t5_assign, t6_assign}), + ov::ParameterVector({param_y, param_w})); +} + +void LoraPatternConvolution::SetUp() { + targetDevice = this->GetParam(); + + ov::PartialShape shape_x = {-1, num_channels, -1, -1}; + + auto param_y = std::make_shared(netType, shape_x); + + // Original Convolution that is modified by LoRA adapter later + auto tx = ov::test::utils::make_convolution(param_y, + netType, + {1, 1}, + {1, 1}, + {0, 0}, + {0, 0}, + {1, 1}, + ov::op::PadType::EXPLICIT, + num_channels); + + // LoRA parameters from states + auto variable_t4 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({num_channels, -1}), netType, t4_name}); + auto t4 = std::make_shared(variable_t4); + auto t4_assign = std::make_shared(t4, 
variable_t4); + + auto variable_t5 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), netType, t5_name}); + auto t5 = std::make_shared(variable_t5); + auto t5_assign = std::make_shared(t5, variable_t5); + + auto variable_t6 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, num_channels}), netType, t6_name}); + auto t6 = std::make_shared(variable_t6); + auto t6_assign = std::make_shared(t6, variable_t6); + + // LoRA pattern with additional Transposes to move channel dimensions into positions where MatMul can be applied + auto t4940 = + std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{2, 3, 0, 1}); + + auto t4941 = std::make_shared(param_y, t4940); + auto t4942 = std::make_shared(t4941, t6, false, true); + auto t4943 = std::make_shared(t4942, t5); + auto t4944 = std::make_shared(t4943, t4, false, true); + + auto t4945 = + std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{2, 3, 0, 1}); + auto t4946 = std::make_shared(t4944, t4945); + + // Mix LoRA part into normally computed activations after the "main" MatMul + auto tz = std::make_shared(tx, t4946); + + auto result_x = std::make_shared(tx); + auto result_z = std::make_shared(tz); + + function = std::make_shared(ov::ResultVector({result_x, result_z}), + ov::SinkVector({t4_assign, t5_assign, t6_assign}), + ov::ParameterVector({param_y})); +} + +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/shared_test_classes/src/subgraph/rotary_pos_emb.cpp b/src/tests/functional/shared_test_classes/src/subgraph/rotary_pos_emb.cpp index 46ea730ac32a8c..a1848903bb76a2 100644 --- a/src/tests/functional/shared_test_classes/src/subgraph/rotary_pos_emb.cpp +++ b/src/tests/functional/shared_test_classes/src/subgraph/rotary_pos_emb.cpp @@ -1027,5 +1027,129 @@ std::shared_ptr RoPETestGPTJSlice::buildROPE_GPTJ(int num_head, return std::make_shared(model_output, ov::ParameterVector{input, sincos}); } +std::shared_ptr RoPETestChatGLM2DRoPEStridedSlice::buildROPE_ChatGLM(int batch, int head_cnt, int rotary_dims) { + auto input = std::make_shared(ov::element::f32, PartialShape{batch, -1, 4096 + 256 + 256}); + auto cos_sin_cache = std::make_shared(ov::element::f32, PartialShape{32768, 32, 2}); + auto position_ids = std::make_shared(ov::element::i32, PartialShape{-1, -1}); + + auto __module_transformer_index_67_Gather = + makeOP({cos_sin_cache, position_ids, 0}, {{"batch_dims", 0}}); + + auto ListUnpack_321 = makeOP({input, -1, {4096, 256, 256}}); + auto view_Reshape = makeOP({ListUnpack_321->output(0), {0, 0, 32, 128}}, {{"special_zero", true}}); + + auto permute_Transpose = makeOP({view_Reshape, {0, 2, 1, 3}}, {}); + + auto slice_Slice_357 = + makeOP({permute_Transpose, {0, 0, 0, 0}, {0, 0, 0, 64}, {1, 1, 1, 1}}, + {{"begin_mask", {1, 1, 1, 0}}, + {"end_mask", {1, 1, 1, 0}}, + {"new_axis_mask", {}}, + {"shrink_axis_mask", {}}, + {"ellipsis_mask", {}}}); + + auto aten_view_Reshape_1 = makeOP({ListUnpack_321->output(1), {0, 0, 2, 128}}, {{"special_zero", true}}); + auto aten_transpose_1 = makeOP({aten_view_Reshape_1, {0, 2, 1, 3}}); + auto shape_of_105249 = makeOP({aten_transpose_1}, {{"output_type", "i32"}}); + auto gather_105252 = makeOP({shape_of_105249, {2}, {0}}, {{"batch_dims", 0}}); + auto scatter_update_63441 = makeOP({{0, 0}, {1}, gather_105252, {0}}); + // connected to cos_sin_cache + auto slice_Slice_369 = + makeOP({__module_transformer_index_67_Gather, {0, 0}, scatter_update_63441, {1, 1}}, + {{"begin_mask", {1, 0}}, + {"end_mask", {1, 0}}, 
+ {"new_axis_mask", {}}, + {"shrink_axis_mask", {}}, + {"ellipsis_mask", {}}}); + auto list_construct_concat_1 = makeOP({{-1}, {1}, gather_105252, {32}, {2}}, {{"axis", 0}}); + + auto reshape_Reshape_373 = + makeOP({slice_Slice_357, {0, 32, 0, 32, 2}}, {{"special_zero", true}}); + auto select_Gather_384 = makeOP({reshape_Reshape_373, 0, -1}, {{"batch_dims", 0}});//x_even + auto select_Gather_381 = makeOP({reshape_Reshape_373, 1, -1}, {{"batch_dims", 0}});//x_odd + + auto view_Reshape_380 = + makeOP({slice_Slice_369, list_construct_concat_1}, {{"special_zero", false}}); + auto select_Gather_385 = makeOP({view_Reshape_380, 0, -1}, {{"batch_dims", 0}});//cos_tab + auto select_Gather_382 = makeOP({view_Reshape_380, 1, -1}, {{"batch_dims", 0}});//sin_tab + + auto mul_Multiply_386 = + makeOP({select_Gather_381, select_Gather_382}, {{"auto_broadcast", "numpy"}});//x_odd_sin + auto mul_Multiply_383 = + makeOP({select_Gather_384, select_Gather_385}, {{"auto_broadcast", "numpy"}});//x_even_cos + auto sub_Subtract_389 = + makeOP({mul_Multiply_383, mul_Multiply_386}, {{"auto_broadcast", "numpy"}}); + + auto mul_Multiply_391 = + makeOP({select_Gather_381, select_Gather_385}, {{"auto_broadcast", "numpy"}});//x_odd_cos + auto mul_Multiply_393 = + makeOP({select_Gather_384, select_Gather_382}, {{"auto_broadcast", "numpy"}});//x_even_sin + auto add_Add_396 = makeOP({mul_Multiply_391, mul_Multiply_393}, {{"auto_broadcast", "numpy"}}); + + auto Unsqueeze_62716 = makeOP({sub_Subtract_389, -1}, {}); + auto Unsqueeze_62717 = makeOP({add_Add_396, -1}, {}); + + auto stack_401 = makeOP({Unsqueeze_62716, Unsqueeze_62717}, {{"axis", -1}}); + auto flatten_Reshape_421 = makeOP({stack_401, {0, 32, 0, 64}}, {{"special_zero", true}}); + auto slice_Slice_363 = + makeOP({permute_Transpose, {0, 0, 0, 64}, {0, 0, 0, INT_MAX}, {1, 1, 1, 1}}, + {{"begin_mask", {1, 1, 1, 0}}, + {"end_mask", {1, 1, 1, 0}}, + {"new_axis_mask", {}}, + {"shrink_axis_mask", {}}, + {"ellipsis_mask", {}}}); + auto cat_Concat_425 = makeOP({flatten_Reshape_421, slice_Slice_363}, {{"axis", -1}}); + return std::make_shared(ov::NodeVector{cat_Concat_425}, + ov::ParameterVector{input, cos_sin_cache, position_ids}); +} + +ov::Tensor RoPETestChatGLM2DRoPEStridedSlice::create_i32_tensor(const ov::Shape& shape, int start, int step) { + auto tensor = ov::Tensor(ov::element::i32, shape); + auto* ptr = static_cast(tensor.data()); + for (size_t i = 0; i < tensor.get_size(); i++) { + ptr[i] = start; + start += step; + } + return tensor; +} + +void RoPETestChatGLM2DRoPEStridedSlice::generate_inputs(const std::vector& targetInputStaticShapes) { + const auto& funcInputs = function->inputs(); + + auto& input_shape = targetInputStaticShapes[0]; + auto batch = input_shape[0]; + auto seq_length = input_shape[1]; + + ov::Tensor t_input = utils::create_and_fill_tensor(funcInputs[0].get_element_type(), input_shape, 2, -1.0f, 32768); + ov::Tensor t_cos_sin_cache = + utils::create_and_fill_tensor(funcInputs[1].get_element_type(), {32768, 32, 2}, 2, -1.0f, 32768); + ov::Tensor t_position_ids = create_i32_tensor(ov::Shape({batch, seq_length}), 15); + + inputs.clear(); + inputs.insert({funcInputs[0].get_node_shared_ptr(), t_input}); + inputs.insert({funcInputs[1].get_node_shared_ptr(), t_cos_sin_cache}); + inputs.insert({funcInputs[2].get_node_shared_ptr(), t_position_ids}); +} + +void RoPETestChatGLM2DRoPEStridedSlice::SetUp() { + targetDevice = this->GetParam(); + + const int batch = 2; + const int seq_length = 7; + const int num_head = 32; + const int rotary_dims = 64; + + 
InputShape inpShape = {{batch, -1, 4096 + 256 + 256}, {{batch, seq_length, 4096 + 256 + 256}}}; + init_input_shapes({inpShape}); + function = buildROPE_ChatGLM(-1, num_head, rotary_dims); +} + +std::string RoPETestChatGLM2DRoPEStridedSlice::getTestCaseName(const testing::TestParamInfo& obj) { + std::string targetDevice = obj.param; + std::ostringstream result; + result << "targetDevice=" << targetDevice; + return result.str(); +} + } // namespace test } // namespace ov diff --git a/tests/constraints.txt b/tests/constraints.txt index f09da0d3b409e9..616aea79c82153 100644 --- a/tests/constraints.txt +++ b/tests/constraints.txt @@ -1,4 +1,4 @@ -numpy>=1.16.6,<1.27 +numpy>=1.16.6,<2.1.0 attrs==23.2.0 distro==1.9.0 h5py>=3.1.0,<3.12.0 @@ -6,7 +6,6 @@ Jinja2>=2.11.2 pandas>=1.3.5 pymongo>=3.12.0 PyYAML>=5.4.1 -scipy>=1.7; python_version <= "3.8" scipy>=1.11.1; python_version >= "3.9" sympy>=1.10 wheel>=0.38.1 diff --git a/tests/layer_tests/requirements.txt b/tests/layer_tests/requirements.txt index 6799b32036df97..cb8e71f0c7fe7f 100644 --- a/tests/layer_tests/requirements.txt +++ b/tests/layer_tests/requirements.txt @@ -4,9 +4,9 @@ numpy onnxruntime requests torch -torchvision; platform_machine == 'arm64' and python_version >= '3.8' +torchvision; platform_machine == 'arm64' and python_version >= '3.9' torchvision; platform_machine != 'arm64' -sympy; platform_machine == 'arm64' and python_version >= '3.8' +sympy; platform_machine == 'arm64' and python_version >= '3.9' sympy; platform_machine != 'arm64' transformers packaging diff --git a/thirdparty/open_model_zoo b/thirdparty/open_model_zoo index f798fd62d66c27..e7df86da686d2e 160000 --- a/thirdparty/open_model_zoo +++ b/thirdparty/open_model_zoo @@ -1 +1 @@ -Subproject commit f798fd62d66c273c757ab9c6038a47a364b726d0 +Subproject commit e7df86da686d2e1600282422e54f66c2fecea160 diff --git a/tools/ovc/openvino/tools/ovc/convert.py b/tools/ovc/openvino/tools/ovc/convert.py index 782fa25ab2dd8b..77693ad4be2ca1 100644 --- a/tools/ovc/openvino/tools/ovc/convert.py +++ b/tools/ovc/openvino/tools/ovc/convert.py @@ -27,7 +27,7 @@ def convert_model( Framework-agnostic parameters: :param input_model: - Model object in original framework (PyTorch, Tensorflow) or path to model file. + Model object in original framework (PyTorch, TensorFlow) or path to model file. 
Supported formats of input model: @@ -35,6 +35,7 @@ def convert_model( torch.nn.Module torch.jit.ScriptModule torch.jit.ScriptFunction + torch.export.ExportedProgram TF tf.compat.v1.Graph diff --git a/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py b/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py index 0119a541494cb9..d3b77c9a61f566 100644 --- a/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py +++ b/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py @@ -89,37 +89,38 @@ def get_pytorch_decoder_for_model_on_disk(argv, args): else: input_model = argv.input_model - if isinstance(input_model, (str, pathlib.Path)): - # attempt to load scripted model - try: - inputs = prepare_torch_inputs(example_inputs) - model = torch.jit.load(input_model) - model.eval() - decoder = TorchScriptPythonDecoder( - model, - example_input=inputs, - shared_memory=args.get("share_weights", True), - module_extensions=extract_module_extensions(args)) + if not isinstance(input_model, (str, pathlib.Path)): + return False + + # attempt to load scripted model + try: + inputs = prepare_torch_inputs(example_inputs) + model = torch.jit.load(input_model) + model.eval() + decoder = TorchScriptPythonDecoder( + model, + example_input=inputs, + shared_memory=args.get("share_weights", True), + module_extensions=extract_module_extensions(args)) + argv.input_model = decoder + argv.framework = 'pytorch' + return True + except: + pass + # attempt to load exported model + try: + exported_program = torch.export.load(input_model) + if hasattr(torch, "export") and isinstance(exported_program, (torch.export.ExportedProgram)): + from packaging import version + if version.parse(torch.__version__) >= version.parse("2.2"): + exported_program = exported_program.run_decompositions() + gm = exported_program.module() + decoder = TorchFXPythonDecoder(gm, dynamic_shapes=True) argv.input_model = decoder argv.framework = 'pytorch' return True - except: - pass - if isinstance(input_model, (str, pathlib.Path)): - # attempt to load exported model - try: - exported_program = torch.export.load(input_model) - if hasattr(torch, "export") and isinstance(exported_program, (torch.export.ExportedProgram)): - from packaging import version - if version.parse(torch.__version__) >= version.parse("2.2"): - exported_program = exported_program.run_decompositions() - gm = exported_program.module() - decoder = TorchFXPythonDecoder(gm, dynamic_shapes=True) - argv.input_model = decoder - argv.framework = 'pytorch' - return True - except: - pass + except: + pass return False
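
A usage sketch for the torch.export loading path refactored above: assuming this change is in place and torch >= 2.2 (the version gate used before run_decompositions()), a serialized ExportedProgram file can be passed to convert_model() directly. The model class and file names below are illustrative only.

import torch
import openvino as ov

class TinyNet(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.relu(x) + 1.0

# Export a PyTorch model to an ExportedProgram and serialize it to disk.
example_input = (torch.randn(1, 3, 224, 224),)
exported = torch.export.export(TinyNet().eval(), example_input)
torch.export.save(exported, "tiny_net.pt2")  # hypothetical path

# get_pytorch_decoder_for_model_on_disk() first tries torch.jit.load() and then
# falls back to torch.export.load(), so the saved program converts directly.
ov_model = ov.convert_model("tiny_net.pt2")
ov.save_model(ov_model, "tiny_net.xml")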
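
For the new REG online-partitioning pipeline added to the NPUW compiler earlier in this diff, a hedged configuration sketch: the property strings below (NPU_USE_NPUW, NPUW_ONLINE_PIPELINE) are assumed to match the option keys referenced in the C++ code, and the model path is illustrative.

import openvino as ov

core = ov.Core()
npuw_config = {
    "NPU_USE_NPUW": "YES",           # assumed key that enables the NPUW path
    "NPUW_ONLINE_PIPELINE": "REG",   # new value handled by currentPipeline()
    # Per warn_unused() above, NPUW_ONLINE_ISOLATE is ignored (with a warning)
    # under REG and COMPUTE, and NPUW_ONLINE_NO_FOLD is ignored under COMPUTE.
}
compiled = core.compile_model("model.xml", "NPU", npuw_config)  # hypothetical model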
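
The LoraPatternMatmul subgraph added above computes z = x @ W^T + ((x @ A^T) * alpha) @ B^T, where A ("lora/MatMul.A"), alpha ("lora/MatMul.alpha") and B ("lora/MatMul.B") come from ReadValue/Assign states; run_test_empty_tensors() checks that with the default zero-sized states the LoRA branch is a no-op, so both model outputs coincide. A minimal numpy sketch of that identity (K, N and the token count follow the test constants, the rest is illustrative):

import numpy as np

K, N, tokens, rank = 563, 2048, 20, 0            # rank 0 models the empty-state case
x = np.random.rand(1, tokens, K).astype(np.float32)
W = np.random.rand(N, K).astype(np.float32)
A = np.zeros((rank, K), dtype=np.float32)        # lora/MatMul.A
alpha = np.zeros((1, rank), dtype=np.float32)    # lora/MatMul.alpha
B = np.zeros((N, rank), dtype=np.float32)        # lora/MatMul.B

tx = x @ W.T                                     # the "main" MatMul
tz = tx + ((x @ A.T) * alpha) @ B.T              # main result plus the LoRA correction
assert np.allclose(tx, tz)                       # an empty adapter contributes nothing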
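
The RoPETestChatGLM2DRoPEStridedSlice graph built above rotates only the first rotary_dims = 64 channels of each 128-wide head and concatenates the untouched remainder back (slice_Slice_363 into cat_Concat_425). In the notation of the x_even/x_odd/cos_tab/sin_tab comments in buildROPE_ChatGLM, each channel pair is rotated as y_even = x_even * cos(theta) - x_odd * sin(theta) and y_odd = x_odd * cos(theta) + x_even * sin(theta), with cos(theta) and sin(theta) gathered from the cos_sin_cache input at the given position_ids.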