diff --git a/.github/dockerfiles/docker_tag b/.github/dockerfiles/docker_tag
index 094e08dade5967..7a82a65ff487c6 100644
--- a/.github/dockerfiles/docker_tag
+++ b/.github/dockerfiles/docker_tag
@@ -1 +1 @@
-pr-24598
+pr-24573
\ No newline at end of file
diff --git a/.github/dockerfiles/ov_build/ubuntu_20_04_x64/Dockerfile b/.github/dockerfiles/ov_build/ubuntu_20_04_x64/Dockerfile
new file mode 100644
index 00000000000000..b13bfe1f2df316
--- /dev/null
+++ b/.github/dockerfiles/ov_build/ubuntu_20_04_x64/Dockerfile
@@ -0,0 +1,72 @@
+FROM openvinogithubactions.azurecr.io/dockerhub/ubuntu:20.04
+
+USER root
+
+# APT configuration
+RUN echo 'Acquire::Retries "10";' > /etc/apt/apt.conf && \
+    echo 'APT::Get::Assume-Yes "true";' >> /etc/apt/apt.conf && \
+    echo 'APT::Get::Fix-Broken "true";' >> /etc/apt/apt.conf && \
+    echo 'APT::Get::no-install-recommends "true";' >> /etc/apt/apt.conf
+
+ENV DEBIAN_FRONTEND="noninteractive" \
+    TZ="Europe/London"
+
+RUN apt-get update && \
+    apt-get install software-properties-common && \
+    add-apt-repository --yes --no-update ppa:git-core/ppa && \
+    add-apt-repository --yes --no-update ppa:deadsnakes/ppa && \
+    apt-get update && \
+    apt-get install \
+        curl \
+        git \
+        ca-certificates \
+        gpg-agent \
+        tzdata \
+        # Pythons
+        python3.8-dev \
+        python3.8-venv \
+        python3.8-distutils \
+        python3.11-dev \
+        python3.11-venv \
+        python3.11-distutils \
+        # For Java API
+        default-jdk \
+        # Compiler \
+        gcc-10 \
+        g++-10 \
+        && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install build dependencies
+ADD install_build_dependencies.sh /install_build_dependencies.sh
+RUN chmod +x /install_build_dependencies.sh && \
+    /install_build_dependencies.sh && \
+    rm -rf /var/lib/apt/lists/*
+
+# Set gcc-10 as a default compiler
+RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 30 && \
+    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 30
+
+# Install sccache
+ARG SCCACHE_VERSION="v0.7.5"
+ENV SCCACHE_HOME="/opt/sccache" \
+    SCCACHE_PATH="/opt/sccache/sccache"
+
+RUN mkdir ${SCCACHE_HOME} && cd ${SCCACHE_HOME} && \
+    SCCACHE_ARCHIVE="sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl.tar.gz" && \
+    curl -SLO https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/${SCCACHE_ARCHIVE} && \
+    tar -xzf ${SCCACHE_ARCHIVE} --strip-components=1 && rm ${SCCACHE_ARCHIVE}
+
+# Setup pip
+ENV PIP_VERSION="24.0"
+RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
+    python3.8 get-pip.py --no-cache-dir pip==${PIP_VERSION} && \
+    python3.11 get-pip.py --no-cache-dir pip==${PIP_VERSION} && \
+    rm -f get-pip.py
+
+# Use Python 3.11 as default instead of Python 3.8
+# Using venv here because other methods to switch the default Python on Ubuntu 20 break both system and wheels build
+RUN python3.11 -m venv venv
+ENV PATH="/venv/bin:$SCCACHE_HOME:$PATH"
+
+ENV PIP_CACHE_DIR=/mount/caches/pip/linux/${PIP_VERSION}
diff --git a/.github/dockerfiles/ov_build/ubuntu_22_04_x64/Dockerfile b/.github/dockerfiles/ov_build/ubuntu_22_04_x64/Dockerfile
new file mode 100644
index 00000000000000..cb3e4cc639e0a9
--- /dev/null
+++ b/.github/dockerfiles/ov_build/ubuntu_22_04_x64/Dockerfile
@@ -0,0 +1,73 @@
+FROM openvinogithubactions.azurecr.io/dockerhub/ubuntu:22.04
+
+USER root
+
+# APT configuration
+RUN echo 'Acquire::Retries "10";' > /etc/apt/apt.conf && \
+    echo 'APT::Get::Assume-Yes "true";' >> /etc/apt/apt.conf && \
+    echo 'APT::Get::Fix-Broken "true";' >> /etc/apt/apt.conf && \
+    echo 'APT::Get::no-install-recommends "true";' >> /etc/apt/apt.conf
+
+ENV DEBIAN_FRONTEND="noninteractive" \
+    TZ="Europe/London"
+
+RUN apt-get update && \
+    apt-get install software-properties-common && \
+    add-apt-repository --yes --no-update ppa:git-core/ppa && \
+    add-apt-repository --yes --no-update ppa:deadsnakes/ppa && \
+    apt-get update && \
+    apt-get install \
+        curl \
+        git \
+        ca-certificates \
+        gpg-agent \
+        tzdata \
+        libtbb2 \
+        # Pythons
+        python3.8-dev \
+        python3.8-venv \
+        python3.8-distutils \
+        python3.11-dev \
+        python3.11-venv \
+        python3.11-distutils \
+        # For Java API
+        default-jdk \
+        # Compiler \
+        gcc-10 \
+        g++-10 \
+        && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install build dependencies
+ADD install_build_dependencies.sh /install_build_dependencies.sh
+RUN chmod +x /install_build_dependencies.sh && \
+    /install_build_dependencies.sh && \
+    rm -rf /var/lib/apt/lists/*
+
+# Set gcc-10 as a default compiler
+RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 30 && \
+    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 30
+
+# Install sccache
+ARG SCCACHE_VERSION="v0.7.5"
+ENV SCCACHE_HOME="/opt/sccache" \
+    SCCACHE_PATH="/opt/sccache/sccache"
+
+RUN mkdir ${SCCACHE_HOME} && cd ${SCCACHE_HOME} && \
+    SCCACHE_ARCHIVE="sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl.tar.gz" && \
+    curl -SLO https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/${SCCACHE_ARCHIVE} && \
+    tar -xzf ${SCCACHE_ARCHIVE} --strip-components=1 && rm ${SCCACHE_ARCHIVE}
+
+# Setup pip
+ENV PIP_VERSION="24.0"
+RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
+    python3.8 get-pip.py --no-cache-dir pip==${PIP_VERSION} && \
+    python3.11 get-pip.py --no-cache-dir pip==${PIP_VERSION} && \
+    rm -f get-pip.py
+
+# Use Python 3.11 as the default instead of the system Python
+# Using venv here because other methods to switch the default Python on Ubuntu 22 break both system and wheels build
+RUN python3.11 -m venv venv
+ENV PATH="/venv/bin:$SCCACHE_HOME:$PATH"
+
+ENV PIP_CACHE_DIR=/mount/caches/pip/linux/${PIP_VERSION}
diff --git a/.github/workflows/job_onnx_runtime.yml b/.github/workflows/job_onnx_runtime.yml
index 31420d793b14cd..b1d7060b6bce33 100644
--- a/.github/workflows/job_onnx_runtime.yml
+++ b/.github/workflows/job_onnx_runtime.yml
@@ -40,10 +40,6 @@ jobs:
       ONNX_RUNTIME_UTILS: ${{ github.workspace }}/install/onnxruntime
       ONNX_RUNTIME_BUILD_DIR: ${{ github.workspace }}/onnxruntime/build
     steps:
-      - name: Set apt retries
-        if: runner.os == 'Linux'
-        run: echo 'Acquire::Retries "10";' > /etc/apt/apt.conf.d/80-retries
-
       - name: Download OpenVINO package
         uses: actions/download-artifact@v4
         with:
@@ -59,38 +55,12 @@ jobs:
           echo "ONNX_RUNTIME_UTILS=$GITHUB_WORKSPACE/install/onnxruntime" >> "$GITHUB_ENV"
           echo "ONNX_RUNTIME_BUILD_DIR=$GITHUB_WORKSPACE/onnxruntime/build" >> "$GITHUB_ENV"

-      - name: Fetch install_build_dependencies.sh and setup_python action
-        uses: actions/checkout@v4
-        with:
-          sparse-checkout: |
-            install_build_dependencies.sh
-            .github/actions/setup_python/action.yml
-          sparse-checkout-cone-mode: false
-          path: 'openvino'
-
-      - name: Install git
-        run: |
-          apt-get update
-          apt-get install --assume-yes --no-install-recommends git ca-certificates
-
-      - name: Setup Python ${{ env.PYTHON_VERSION }}
-        uses: ./openvino/.github/actions/setup_python
-        with:
-          version: '3.11'
-          should-setup-pip-paths: 'false'
-
       - name: Extract OpenVINO package
         run: |
           pushd ${INSTALL_DIR}
             tar -xzf openvino_package.tar.gz -C ${INSTALL_DIR}
           popd

-      - name: Install OpenVINO dependencies
-        run: |
-
${INSTALL_DIR}/install_dependencies/install_openvino_dependencies.sh -c=core -c=dev -y - # since we are on Ubuntu 22.04, but compiled OpenVINO on Ubuntu 20.04, we need to install `libtbb2` - apt-get install --assume-yes --no-install-recommends libtbb2 - - name: Clone ONNX Runtime run: | hash=`tr -s '\n ' < ${ONNX_RUNTIME_UTILS}/version` @@ -102,14 +72,6 @@ jobs: # Tests # - - name: Install Build Dependencies - run: bash ${OPENVINO_REPO}/install_build_dependencies.sh - - - name: Install sccache - uses: mozilla-actions/sccache-action@v0.0.4 - with: - version: "v0.7.5" - - name: Build Lin ONNX Runtime run: | source ${INSTALL_DIR}/setupvars.sh @@ -133,7 +95,7 @@ jobs: if: ${{ runner.arch != 'ARM64' }} # Ticket: 126277 run: | # see https://github.com/microsoft/onnxruntime/issues/13197#issuecomment-1264542497 - apt-get install --assume-yes --no-install-recommends language-pack-en + apt-get update && apt-get install --assume-yes --no-install-recommends language-pack-en locale-gen en_US.UTF-8 update-locale LANG=en_US.UTF-8 diff --git a/.github/workflows/job_tokenizers.yml b/.github/workflows/job_tokenizers.yml index 23eca8cd5bb32a..5198ee5db996ae 100644 --- a/.github/workflows/job_tokenizers.yml +++ b/.github/workflows/job_tokenizers.yml @@ -56,6 +56,7 @@ jobs: install_build_dependencies.sh - name: Setup Python ${{ env.PYTHON_VERSION }} + if: ${{ runner.os != 'Linux' }} # We do not need to install Python on Linux as we use Docker with it installed uses: ./.github/actions/setup_python with: version: ${{ env.PYTHON_VERSION }} @@ -94,10 +95,6 @@ jobs: # Dependencies # - - name: Install build dependencies (Linux) - if: runner.os == 'Linux' - run: ./install_build_dependencies.sh - - name: Install python dependencies run: | # wheel packaging diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 5b4e6769cc1350..6b43a90fb9f61a 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -25,6 +25,7 @@ jobs: runs-on: ubuntu-latest outputs: affected_components: "${{ steps.smart_ci.outputs.affected_components }}" + changed_components: "${{ steps.smart_ci.outputs.changed_components }}" skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action @@ -45,15 +46,42 @@ jobs: skip_when_only_listed_labels_set: 'docs' skip_when_only_listed_files_changed: '*.md,*.rst,*.png,*.jpg,*.svg' - Build: + - name: Show affected components + run: | + echo "${{ toJSON(steps.smart_ci.outputs.affected_components) }}" + shell: bash + + Docker: needs: Smart_CI + runs-on: aks-linux-4-cores-16gb-docker-build + container: + image: openvinogithubactions.azurecr.io/docker_build:0.2 + volumes: + - /mount:/mount + outputs: + images: "${{ steps.handle_docker.outputs.images }}" + steps: + - name: Checkout + uses: actions/checkout@v4 + + - uses: ./.github/actions/handle_docker + id: handle_docker + with: + images: | + ov_build/ubuntu_20_04_x64 + registry: 'openvinogithubactions.azurecr.io' + dockerfiles_root_dir: '.github/dockerfiles' + changed_components: ${{ needs.smart_ci.outputs.changed_components }} + + Build: + needs: [Docker, Smart_CI] timeout-minutes: 150 defaults: run: shell: bash runs-on: aks-linux-16-cores-32gb container: - image: openvinogithubactions.azurecr.io/dockerhub/ubuntu:20.04 + image: ${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_x64 }} volumes: - /mount:/mount options: -e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING @@ -80,20 +108,21 @@ jobs: if: "!needs.smart_ci.outputs.skip_workflow" steps: - - name: Set apt 
retries - run: echo 'Acquire::Retries "10";' > /etc/apt/apt.conf.d/80-retries - - - name: Install git - run: | - apt-get update - apt-get install --assume-yes --no-install-recommends git ca-certificates - - name: Clone OpenVINO uses: actions/checkout@v4 with: path: ${{ env.OPENVINO_REPO }} submodules: 'true' + # Ticket: 139627 + - name: Checkout the latest OneDNN for GPU in nightly + if: ${{ github.event_name == 'schedule' }} + working-directory: ${{ env.OPENVINO_REPO }}/src/plugins/intel_gpu/thirdparty/onednn_gpu + run: | + git fetch origin + git checkout main + git rev-parse HEAD + - name: Clone OpenVINO Contrib uses: actions/checkout@v4 with: @@ -113,26 +142,6 @@ jobs: # Dependencies # - - name: Install build dependencies - run: | - bash ${OPENVINO_REPO}/install_build_dependencies.sh - # default-jdk - Java API - apt install --assume-yes --no-install-recommends default-jdk - - - name: Install sccache - uses: mozilla-actions/sccache-action@v0.0.4 - with: - version: "v0.7.5" - - - name: Setup Python ${{ env.PYTHON_VERSION }} - uses: ./openvino/.github/actions/setup_python - with: - version: ${{ env.PYTHON_VERSION }} - pip-cache-path: ${{ env.PIP_CACHE_PATH }} - should-setup-pip-paths: 'true' - self-hosted-runner: 'true' - show-cache-info: 'true' - - name: Install python dependencies run: | # For Python API: build and wheel packaging @@ -450,12 +459,12 @@ jobs: name: ONNX Runtime Integration if: fromJSON(needs.smart_ci.outputs.affected_components).ONNX_RT || fromJSON(needs.smart_ci.outputs.affected_components).ONNX_FE - needs: [ Build, Smart_CI ] + needs: [ Build, Smart_CI, Docker ] uses: ./.github/workflows/job_onnx_runtime.yml with: runner: 'aks-linux-16-cores-32gb' - container: '{"image": "openvinogithubactions.azurecr.io/dockerhub/ubuntu:22.04", "volumes": ["/mount:/mount"], "options": "-e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING"}' - sccache-azure-key-prefix: 'ubuntu22_x86_64_onnxruntime' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_x64 }}", "volumes": ["/mount:/mount"], "options": "-e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING"}' + sccache-azure-key-prefix: 'ubuntu20_x86_64_onnxruntime' ONNX_Models: name: ONNX Models Tests @@ -676,12 +685,12 @@ jobs: Openvino_tokenizers: name: OpenVINO tokenizers extension - needs: [ Build, Smart_CI ] + needs: [ Build, Smart_CI, Docker ] uses: ./.github/workflows/job_tokenizers.yml with: runner: 'aks-linux-4-cores-16gb' shell: bash - container: '{"image": "openvinogithubactions.azurecr.io/dockerhub/ubuntu:20.04", "volumes": ["/mount:/mount"]}' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_x64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} if: fromJSON(needs.smart_ci.outputs.affected_components).TOKENIZERS diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml index 58d3184d8cf276..2d50900d157e3e 100644 --- a/.github/workflows/linux_arm64.yml +++ b/.github/workflows/linux_arm64.yml @@ -323,12 +323,12 @@ jobs: name: ONNX Runtime Integration if: fromJSON(needs.smart_ci.outputs.affected_components).ONNX_RT || fromJSON(needs.smart_ci.outputs.affected_components).ONNX_FE - needs: [ Build, Smart_CI ] + needs: [ Build, Smart_CI, Docker ] uses: ./.github/workflows/job_onnx_runtime.yml with: runner: 'aks-linux-16-cores-arm' - container: '{"image": "openvinogithubactions.azurecr.io/dockerhub/ubuntu:22.04", "volumes": ["/mount:/mount"], "options": 
"-e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING"}' - sccache-azure-key-prefix: 'ubuntu22_aarch64_onnxruntime' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"], "options": "-e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING"}' + sccache-azure-key-prefix: 'ubuntu20_aarch64_onnxruntime' Openvino_tokenizers: name: OpenVINO tokenizers extension diff --git a/docs/articles_en/about-openvino/performance-benchmarks.rst b/docs/articles_en/about-openvino/performance-benchmarks.rst index aef6b8be5c6d11..bb874ea459380d 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks.rst @@ -163,7 +163,7 @@ connection is dedicated only to measuring performance. The benchmark setup for OVMS consists of four main parts: - .. image:: ../_static/images/performance_benchmarks_ovms_02.png + .. image:: ../assets/images/performance_benchmarks_ovms_02.png :alt: OVMS Benchmark Setup Diagram * **OpenVINO™ Model Server** is launched as a docker container on the server platform and it diff --git a/docs/sphinx_setup/_static/images/BASIC_FLOW_IE_C.svg b/docs/articles_en/assets/images/BASIC_FLOW_IE_C.svg similarity index 100% rename from docs/sphinx_setup/_static/images/BASIC_FLOW_IE_C.svg rename to docs/articles_en/assets/images/BASIC_FLOW_IE_C.svg diff --git a/docs/sphinx_setup/_static/images/DEVELOPMENT_FLOW_V3_crunch.svg b/docs/articles_en/assets/images/DEVELOPMENT_FLOW_V3_crunch.svg similarity index 100% rename from docs/sphinx_setup/_static/images/DEVELOPMENT_FLOW_V3_crunch.svg rename to docs/articles_en/assets/images/DEVELOPMENT_FLOW_V3_crunch.svg diff --git a/docs/sphinx_setup/_static/images/DeepSpeech-0.8.2.png b/docs/articles_en/assets/images/DeepSpeech-0.8.2.png similarity index 100% rename from docs/sphinx_setup/_static/images/DeepSpeech-0.8.2.png rename to docs/articles_en/assets/images/DeepSpeech-0.8.2.png diff --git a/docs/sphinx_setup/_static/images/DeviceDriverVersion.svg b/docs/articles_en/assets/images/DeviceDriverVersion.svg similarity index 100% rename from docs/sphinx_setup/_static/images/DeviceDriverVersion.svg rename to docs/articles_en/assets/images/DeviceDriverVersion.svg diff --git a/docs/sphinx_setup/_static/images/DeviceManager.PNG b/docs/articles_en/assets/images/DeviceManager.PNG similarity index 100% rename from docs/sphinx_setup/_static/images/DeviceManager.PNG rename to docs/articles_en/assets/images/DeviceManager.PNG diff --git a/docs/sphinx_setup/_static/images/FaceNet.svg b/docs/articles_en/assets/images/FaceNet.svg similarity index 100% rename from docs/sphinx_setup/_static/images/FaceNet.svg rename to docs/articles_en/assets/images/FaceNet.svg diff --git a/docs/sphinx_setup/_static/images/IMPLEMENT_PIPELINE_with_API_C.svg b/docs/articles_en/assets/images/IMPLEMENT_PIPELINE_with_API_C.svg similarity index 100% rename from docs/sphinx_setup/_static/images/IMPLEMENT_PIPELINE_with_API_C.svg rename to docs/articles_en/assets/images/IMPLEMENT_PIPELINE_with_API_C.svg diff --git a/docs/sphinx_setup/_static/images/MO_connection_example_1.svg b/docs/articles_en/assets/images/MO_connection_example_1.svg similarity index 100% rename from docs/sphinx_setup/_static/images/MO_connection_example_1.svg rename to docs/articles_en/assets/images/MO_connection_example_1.svg diff --git a/docs/sphinx_setup/_static/images/MO_conversion_pipeline.svg b/docs/articles_en/assets/images/MO_conversion_pipeline.svg similarity index 
100% rename from docs/sphinx_setup/_static/images/MO_conversion_pipeline.svg rename to docs/articles_en/assets/images/MO_conversion_pipeline.svg diff --git a/docs/sphinx_setup/_static/images/MO_graph_after_extractors.svg b/docs/articles_en/assets/images/MO_graph_after_extractors.svg similarity index 100% rename from docs/sphinx_setup/_static/images/MO_graph_after_extractors.svg rename to docs/articles_en/assets/images/MO_graph_after_extractors.svg diff --git a/docs/sphinx_setup/_static/images/MO_graph_after_loader.svg b/docs/articles_en/assets/images/MO_graph_after_loader.svg similarity index 100% rename from docs/sphinx_setup/_static/images/MO_graph_after_loader.svg rename to docs/articles_en/assets/images/MO_graph_after_loader.svg diff --git a/docs/sphinx_setup/_static/images/MO_graph_before_partial_inference.svg b/docs/articles_en/assets/images/MO_graph_before_partial_inference.svg similarity index 100% rename from docs/sphinx_setup/_static/images/MO_graph_before_partial_inference.svg rename to docs/articles_en/assets/images/MO_graph_before_partial_inference.svg diff --git a/docs/sphinx_setup/_static/images/MO_ports_example_1.svg b/docs/articles_en/assets/images/MO_ports_example_1.svg similarity index 100% rename from docs/sphinx_setup/_static/images/MO_ports_example_1.svg rename to docs/articles_en/assets/images/MO_ports_example_1.svg diff --git a/docs/sphinx_setup/_static/images/MO_ports_example_2.svg b/docs/articles_en/assets/images/MO_ports_example_2.svg similarity index 100% rename from docs/sphinx_setup/_static/images/MO_ports_example_2.svg rename to docs/articles_en/assets/images/MO_ports_example_2.svg diff --git a/docs/sphinx_setup/_static/images/MO_transformations_graph.svg b/docs/articles_en/assets/images/MO_transformations_graph.svg similarity index 100% rename from docs/sphinx_setup/_static/images/MO_transformations_graph.svg rename to docs/articles_en/assets/images/MO_transformations_graph.svg diff --git a/docs/sphinx_setup/_static/images/NCF_start.svg b/docs/articles_en/assets/images/NCF_start.svg similarity index 100% rename from docs/sphinx_setup/_static/images/NCF_start.svg rename to docs/articles_en/assets/images/NCF_start.svg diff --git a/docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img01-localhost.png b/docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img01-localhost.png similarity index 100% rename from docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img01-localhost.png rename to docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img01-localhost.png diff --git a/docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img02-launch.png b/docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img02-launch.png similarity index 100% rename from docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img02-launch.png rename to docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img02-launch.png diff --git a/docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img03-hotspots.png b/docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img03-hotspots.png similarity index 100% rename from docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img03-hotspots.png rename to docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img03-hotspots.png diff --git a/docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img04-vtunesummary.png 
b/docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img04-vtunesummary.png similarity index 100% rename from docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img04-vtunesummary.png rename to docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img04-vtunesummary.png diff --git a/docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img05-vtunebottomup.png b/docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img05-vtunebottomup.png similarity index 100% rename from docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img05-vtunebottomup.png rename to docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img05-vtunebottomup.png diff --git a/docs/sphinx_setup/_static/images/WHAT_TO_USE.svg b/docs/articles_en/assets/images/WHAT_TO_USE.svg similarity index 100% rename from docs/sphinx_setup/_static/images/WHAT_TO_USE.svg rename to docs/articles_en/assets/images/WHAT_TO_USE.svg diff --git a/docs/sphinx_setup/_static/images/add.common.png b/docs/articles_en/assets/images/add.common.png similarity index 100% rename from docs/sphinx_setup/_static/images/add.common.png rename to docs/articles_en/assets/images/add.common.png diff --git a/docs/sphinx_setup/_static/images/add.transformed.png b/docs/articles_en/assets/images/add.transformed.png similarity index 100% rename from docs/sphinx_setup/_static/images/add.transformed.png rename to docs/articles_en/assets/images/add.transformed.png diff --git a/docs/sphinx_setup/_static/images/applying_low_latency_2.svg b/docs/articles_en/assets/images/applying_low_latency_2.svg similarity index 100% rename from docs/sphinx_setup/_static/images/applying_low_latency_2.svg rename to docs/articles_en/assets/images/applying_low_latency_2.svg diff --git a/docs/sphinx_setup/_static/images/autoplugin_accelerate.svg b/docs/articles_en/assets/images/autoplugin_accelerate.svg similarity index 100% rename from docs/sphinx_setup/_static/images/autoplugin_accelerate.svg rename to docs/articles_en/assets/images/autoplugin_accelerate.svg diff --git a/docs/sphinx_setup/_static/images/batch_device.svg b/docs/articles_en/assets/images/batch_device.svg similarity index 100% rename from docs/sphinx_setup/_static/images/batch_device.svg rename to docs/articles_en/assets/images/batch_device.svg diff --git a/docs/sphinx_setup/_static/images/batch_relaxation.png b/docs/articles_en/assets/images/batch_relaxation.png similarity index 100% rename from docs/sphinx_setup/_static/images/batch_relaxation.png rename to docs/articles_en/assets/images/batch_relaxation.png diff --git a/docs/sphinx_setup/_static/images/caching_enabled.svg b/docs/articles_en/assets/images/caching_enabled.svg similarity index 100% rename from docs/sphinx_setup/_static/images/caching_enabled.svg rename to docs/articles_en/assets/images/caching_enabled.svg diff --git a/docs/sphinx_setup/_static/images/caching_times.svg b/docs/articles_en/assets/images/caching_times.svg similarity index 100% rename from docs/sphinx_setup/_static/images/caching_times.svg rename to docs/articles_en/assets/images/caching_times.svg diff --git a/docs/sphinx_setup/_static/images/compressed_int8_Convolution_weights.png b/docs/articles_en/assets/images/compressed_int8_Convolution_weights.png similarity index 100% rename from docs/sphinx_setup/_static/images/compressed_int8_Convolution_weights.png rename to docs/articles_en/assets/images/compressed_int8_Convolution_weights.png diff --git 
a/docs/sphinx_setup/_static/images/cpu_execution_conventional_approach.svg b/docs/articles_en/assets/images/cpu_execution_conventional_approach.svg similarity index 100% rename from docs/sphinx_setup/_static/images/cpu_execution_conventional_approach.svg rename to docs/articles_en/assets/images/cpu_execution_conventional_approach.svg diff --git a/docs/sphinx_setup/_static/images/cpu_execution_streams.svg b/docs/articles_en/assets/images/cpu_execution_streams.svg similarity index 100% rename from docs/sphinx_setup/_static/images/cpu_execution_streams.svg rename to docs/articles_en/assets/images/cpu_execution_streams.svg diff --git a/docs/sphinx_setup/_static/images/cpu_execution_streams_2.svg b/docs/articles_en/assets/images/cpu_execution_streams_2.svg similarity index 100% rename from docs/sphinx_setup/_static/images/cpu_execution_streams_2.svg rename to docs/articles_en/assets/images/cpu_execution_streams_2.svg diff --git a/docs/sphinx_setup/_static/images/datumaro.png b/docs/articles_en/assets/images/datumaro.png similarity index 100% rename from docs/sphinx_setup/_static/images/datumaro.png rename to docs/articles_en/assets/images/datumaro.png diff --git a/docs/sphinx_setup/_static/images/deploy_encrypted_model.svg b/docs/articles_en/assets/images/deploy_encrypted_model.svg similarity index 100% rename from docs/sphinx_setup/_static/images/deploy_encrypted_model.svg rename to docs/articles_en/assets/images/deploy_encrypted_model.svg diff --git a/docs/sphinx_setup/_static/images/deployment_full.svg b/docs/articles_en/assets/images/deployment_full.svg similarity index 100% rename from docs/sphinx_setup/_static/images/deployment_full.svg rename to docs/articles_en/assets/images/deployment_full.svg diff --git a/docs/sphinx_setup/_static/images/deployment_simplified.svg b/docs/articles_en/assets/images/deployment_simplified.svg similarity index 100% rename from docs/sphinx_setup/_static/images/deployment_simplified.svg rename to docs/articles_en/assets/images/deployment_simplified.svg diff --git a/docs/sphinx_setup/_static/images/dog.png b/docs/articles_en/assets/images/dog.png similarity index 100% rename from docs/sphinx_setup/_static/images/dog.png rename to docs/articles_en/assets/images/dog.png diff --git a/docs/sphinx_setup/_static/images/expanded_int8_Convolution_weights.png b/docs/articles_en/assets/images/expanded_int8_Convolution_weights.png similarity index 100% rename from docs/sphinx_setup/_static/images/expanded_int8_Convolution_weights.png rename to docs/articles_en/assets/images/expanded_int8_Convolution_weights.png diff --git a/docs/sphinx_setup/_static/images/fq.common.svg b/docs/articles_en/assets/images/fq.common.svg similarity index 100% rename from docs/sphinx_setup/_static/images/fq.common.svg rename to docs/articles_en/assets/images/fq.common.svg diff --git a/docs/sphinx_setup/_static/images/fq.transformed.svg b/docs/articles_en/assets/images/fq.transformed.svg similarity index 100% rename from docs/sphinx_setup/_static/images/fq.transformed.svg rename to docs/articles_en/assets/images/fq.transformed.svg diff --git a/docs/sphinx_setup/_static/images/fq_and_convolution.common.png b/docs/articles_en/assets/images/fq_and_convolution.common.png similarity index 100% rename from docs/sphinx_setup/_static/images/fq_and_convolution.common.png rename to docs/articles_en/assets/images/fq_and_convolution.common.png diff --git a/docs/sphinx_setup/_static/images/fq_and_convolution.transformed.png b/docs/articles_en/assets/images/fq_and_convolution.transformed.png similarity index 
100% rename from docs/sphinx_setup/_static/images/fq_and_convolution.transformed.png rename to docs/articles_en/assets/images/fq_and_convolution.transformed.png diff --git a/docs/sphinx_setup/_static/images/fq_fq_and_convolution.common.png b/docs/articles_en/assets/images/fq_fq_and_convolution.common.png similarity index 100% rename from docs/sphinx_setup/_static/images/fq_fq_and_convolution.common.png rename to docs/articles_en/assets/images/fq_fq_and_convolution.common.png diff --git a/docs/sphinx_setup/_static/images/get_started_with_cpp.jpg b/docs/articles_en/assets/images/get_started_with_cpp.jpg similarity index 100% rename from docs/sphinx_setup/_static/images/get_started_with_cpp.jpg rename to docs/articles_en/assets/images/get_started_with_cpp.jpg diff --git a/docs/sphinx_setup/_static/images/get_started_with_python.gif b/docs/articles_en/assets/images/get_started_with_python.gif similarity index 100% rename from docs/sphinx_setup/_static/images/get_started_with_python.gif rename to docs/articles_en/assets/images/get_started_with_python.gif diff --git a/docs/sphinx_setup/_static/images/graph_rewrite_efficient_search.png b/docs/articles_en/assets/images/graph_rewrite_efficient_search.png similarity index 100% rename from docs/sphinx_setup/_static/images/graph_rewrite_efficient_search.png rename to docs/articles_en/assets/images/graph_rewrite_efficient_search.png diff --git a/docs/sphinx_setup/_static/images/graph_rewrite_execution.png b/docs/articles_en/assets/images/graph_rewrite_execution.png similarity index 100% rename from docs/sphinx_setup/_static/images/graph_rewrite_execution.png rename to docs/articles_en/assets/images/graph_rewrite_execution.png diff --git a/docs/sphinx_setup/_static/images/inception_v1_first_block.svg b/docs/articles_en/assets/images/inception_v1_first_block.svg similarity index 100% rename from docs/sphinx_setup/_static/images/inception_v1_first_block.svg rename to docs/articles_en/assets/images/inception_v1_first_block.svg diff --git a/docs/sphinx_setup/_static/images/inception_v1_std_input.svg b/docs/articles_en/assets/images/inception_v1_std_input.svg similarity index 100% rename from docs/sphinx_setup/_static/images/inception_v1_std_input.svg rename to docs/articles_en/assets/images/inception_v1_std_input.svg diff --git a/docs/sphinx_setup/_static/images/inception_v1_std_output.svg b/docs/articles_en/assets/images/inception_v1_std_output.svg similarity index 100% rename from docs/sphinx_setup/_static/images/inception_v1_std_output.svg rename to docs/articles_en/assets/images/inception_v1_std_output.svg diff --git a/docs/sphinx_setup/_static/images/large_batch_approach.svg b/docs/articles_en/assets/images/large_batch_approach.svg similarity index 100% rename from docs/sphinx_setup/_static/images/large_batch_approach.svg rename to docs/articles_en/assets/images/large_batch_approach.svg diff --git a/docs/sphinx_setup/_static/images/launch_in_binder.svg b/docs/articles_en/assets/images/launch_in_binder.svg similarity index 100% rename from docs/sphinx_setup/_static/images/launch_in_binder.svg rename to docs/articles_en/assets/images/launch_in_binder.svg diff --git a/docs/sphinx_setup/_static/images/llt2_use_const_initializer.svg b/docs/articles_en/assets/images/llt2_use_const_initializer.svg similarity index 100% rename from docs/sphinx_setup/_static/images/llt2_use_const_initializer.svg rename to docs/articles_en/assets/images/llt2_use_const_initializer.svg diff --git a/docs/sphinx_setup/_static/images/lm_1b.svg 
b/docs/articles_en/assets/images/lm_1b.svg similarity index 100% rename from docs/sphinx_setup/_static/images/lm_1b.svg rename to docs/articles_en/assets/images/lm_1b.svg diff --git a/docs/sphinx_setup/_static/images/low_latency_limitation_2.svg b/docs/articles_en/assets/images/low_latency_limitation_2.svg similarity index 100% rename from docs/sphinx_setup/_static/images/low_latency_limitation_2.svg rename to docs/articles_en/assets/images/low_latency_limitation_2.svg diff --git a/docs/sphinx_setup/_static/images/low_precision_transformation_pipeline.svg b/docs/articles_en/assets/images/low_precision_transformation_pipeline.svg similarity index 100% rename from docs/sphinx_setup/_static/images/low_precision_transformation_pipeline.svg rename to docs/articles_en/assets/images/low_precision_transformation_pipeline.svg diff --git a/docs/sphinx_setup/_static/images/make_stateful_detailed.png b/docs/articles_en/assets/images/make_stateful_detailed.png similarity index 100% rename from docs/sphinx_setup/_static/images/make_stateful_detailed.png rename to docs/articles_en/assets/images/make_stateful_detailed.png diff --git a/docs/sphinx_setup/_static/images/make_stateful_simple.svg b/docs/articles_en/assets/images/make_stateful_simple.svg similarity index 100% rename from docs/sphinx_setup/_static/images/make_stateful_simple.svg rename to docs/articles_en/assets/images/make_stateful_simple.svg diff --git a/docs/sphinx_setup/_static/images/model_fq_and_convolution.common.svg b/docs/articles_en/assets/images/model_fq_and_convolution.common.svg similarity index 100% rename from docs/sphinx_setup/_static/images/model_fq_and_convolution.common.svg rename to docs/articles_en/assets/images/model_fq_and_convolution.common.svg diff --git a/docs/sphinx_setup/_static/images/model_fq_and_convolution.transformed.svg b/docs/articles_en/assets/images/model_fq_and_convolution.transformed.svg similarity index 100% rename from docs/sphinx_setup/_static/images/model_fq_and_convolution.transformed.svg rename to docs/articles_en/assets/images/model_fq_and_convolution.transformed.svg diff --git a/docs/sphinx_setup/_static/images/model_qdq_and_convolution.common.svg b/docs/articles_en/assets/images/model_qdq_and_convolution.common.svg similarity index 100% rename from docs/sphinx_setup/_static/images/model_qdq_and_convolution.common.svg rename to docs/articles_en/assets/images/model_qdq_and_convolution.common.svg diff --git a/docs/sphinx_setup/_static/images/nncf_workflow.svg b/docs/articles_en/assets/images/nncf_workflow.svg similarity index 100% rename from docs/sphinx_setup/_static/images/nncf_workflow.svg rename to docs/articles_en/assets/images/nncf_workflow.svg diff --git a/docs/sphinx_setup/_static/images/open_in_colab.svg b/docs/articles_en/assets/images/open_in_colab.svg similarity index 100% rename from docs/sphinx_setup/_static/images/open_in_colab.svg rename to docs/articles_en/assets/images/open_in_colab.svg diff --git a/docs/sphinx_setup/_static/images/original_vs_reshaped_model.svg b/docs/articles_en/assets/images/original_vs_reshaped_model.svg similarity index 100% rename from docs/sphinx_setup/_static/images/original_vs_reshaped_model.svg rename to docs/articles_en/assets/images/original_vs_reshaped_model.svg diff --git a/docs/sphinx_setup/_static/images/ov_insert_node.png b/docs/articles_en/assets/images/ov_insert_node.png similarity index 100% rename from docs/sphinx_setup/_static/images/ov_insert_node.png rename to docs/articles_en/assets/images/ov_insert_node.png diff --git 
a/docs/sphinx_setup/_static/images/ov_replace_node.png b/docs/articles_en/assets/images/ov_replace_node.png similarity index 100% rename from docs/sphinx_setup/_static/images/ov_replace_node.png rename to docs/articles_en/assets/images/ov_replace_node.png diff --git a/docs/sphinx_setup/_static/images/ov_workflow_diagram_convenience.svg b/docs/articles_en/assets/images/ov_workflow_diagram_convenience.svg similarity index 100% rename from docs/sphinx_setup/_static/images/ov_workflow_diagram_convenience.svg rename to docs/articles_en/assets/images/ov_workflow_diagram_convenience.svg diff --git a/docs/sphinx_setup/_static/images/ov_workflow_diagram_performance.svg b/docs/articles_en/assets/images/ov_workflow_diagram_performance.svg similarity index 100% rename from docs/sphinx_setup/_static/images/ov_workflow_diagram_performance.svg rename to docs/articles_en/assets/images/ov_workflow_diagram_performance.svg diff --git a/docs/sphinx_setup/_static/images/ovsa_diagram.svg b/docs/articles_en/assets/images/ovsa_diagram.svg similarity index 100% rename from docs/sphinx_setup/_static/images/ovsa_diagram.svg rename to docs/articles_en/assets/images/ovsa_diagram.svg diff --git a/docs/sphinx_setup/_static/images/ovsa_example.svg b/docs/articles_en/assets/images/ovsa_example.svg similarity index 100% rename from docs/sphinx_setup/_static/images/ovsa_example.svg rename to docs/articles_en/assets/images/ovsa_example.svg diff --git a/docs/sphinx_setup/_static/images/performance_benchmarks_ovms_02.png b/docs/articles_en/assets/images/performance_benchmarks_ovms_02.png similarity index 100% rename from docs/sphinx_setup/_static/images/performance_benchmarks_ovms_02.png rename to docs/articles_en/assets/images/performance_benchmarks_ovms_02.png diff --git a/docs/sphinx_setup/_static/images/preprocess_not_fit.png b/docs/articles_en/assets/images/preprocess_not_fit.png similarity index 100% rename from docs/sphinx_setup/_static/images/preprocess_not_fit.png rename to docs/articles_en/assets/images/preprocess_not_fit.png diff --git a/docs/sphinx_setup/_static/images/qdq_propagation.png b/docs/articles_en/assets/images/qdq_propagation.png similarity index 100% rename from docs/sphinx_setup/_static/images/qdq_propagation.png rename to docs/articles_en/assets/images/qdq_propagation.png diff --git a/docs/sphinx_setup/_static/images/quantization_picture.svg b/docs/articles_en/assets/images/quantization_picture.svg similarity index 100% rename from docs/sphinx_setup/_static/images/quantization_picture.svg rename to docs/articles_en/assets/images/quantization_picture.svg diff --git a/docs/sphinx_setup/_static/images/IE_PLUGIN_DG/images/quantized_convolution.png b/docs/articles_en/assets/images/quantized_convolution.png similarity index 100% rename from docs/sphinx_setup/_static/images/IE_PLUGIN_DG/images/quantized_convolution.png rename to docs/articles_en/assets/images/quantized_convolution.png diff --git a/docs/sphinx_setup/_static/images/IE_PLUGIN_DG/images/quantized_model_example.png b/docs/articles_en/assets/images/quantized_model_example.png similarity index 100% rename from docs/sphinx_setup/_static/images/IE_PLUGIN_DG/images/quantized_model_example.png rename to docs/articles_en/assets/images/quantized_model_example.png diff --git a/docs/sphinx_setup/_static/images/small_IR_graph_demonstration.png b/docs/articles_en/assets/images/small_IR_graph_demonstration.png similarity index 100% rename from docs/sphinx_setup/_static/images/small_IR_graph_demonstration.png rename to 
docs/articles_en/assets/images/small_IR_graph_demonstration.png diff --git a/docs/sphinx_setup/_static/images/stateful_model_example.svg b/docs/articles_en/assets/images/stateful_model_example.svg similarity index 100% rename from docs/sphinx_setup/_static/images/stateful_model_example.svg rename to docs/articles_en/assets/images/stateful_model_example.svg diff --git a/docs/sphinx_setup/_static/images/stateful_model_init_subgraph.svg b/docs/articles_en/assets/images/stateful_model_init_subgraph.svg similarity index 100% rename from docs/sphinx_setup/_static/images/stateful_model_init_subgraph.svg rename to docs/articles_en/assets/images/stateful_model_init_subgraph.svg diff --git a/docs/sphinx_setup/_static/images/step2_markup1.svg b/docs/articles_en/assets/images/step2_markup1.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step2_markup1.svg rename to docs/articles_en/assets/images/step2_markup1.svg diff --git a/docs/sphinx_setup/_static/images/step2_markup2.svg b/docs/articles_en/assets/images/step2_markup2.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step2_markup2.svg rename to docs/articles_en/assets/images/step2_markup2.svg diff --git a/docs/sphinx_setup/_static/images/step2_markup3.svg b/docs/articles_en/assets/images/step2_markup3.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step2_markup3.svg rename to docs/articles_en/assets/images/step2_markup3.svg diff --git a/docs/sphinx_setup/_static/images/step2_markup4.svg b/docs/articles_en/assets/images/step2_markup4.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step2_markup4.svg rename to docs/articles_en/assets/images/step2_markup4.svg diff --git a/docs/sphinx_setup/_static/images/step2_markup5.svg b/docs/articles_en/assets/images/step2_markup5.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step2_markup5.svg rename to docs/articles_en/assets/images/step2_markup5.svg diff --git a/docs/sphinx_setup/_static/images/step2_markup6.svg b/docs/articles_en/assets/images/step2_markup6.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step2_markup6.svg rename to docs/articles_en/assets/images/step2_markup6.svg diff --git a/docs/sphinx_setup/_static/images/step2_markup7.svg b/docs/articles_en/assets/images/step2_markup7.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step2_markup7.svg rename to docs/articles_en/assets/images/step2_markup7.svg diff --git a/docs/sphinx_setup/_static/images/step2_markup_original.svg b/docs/articles_en/assets/images/step2_markup_original.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step2_markup_original.svg rename to docs/articles_en/assets/images/step2_markup_original.svg diff --git a/docs/sphinx_setup/_static/images/step3_original.svg b/docs/articles_en/assets/images/step3_original.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step3_original.svg rename to docs/articles_en/assets/images/step3_original.svg diff --git a/docs/sphinx_setup/_static/images/step3_transformed.svg b/docs/articles_en/assets/images/step3_transformed.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step3_transformed.svg rename to docs/articles_en/assets/images/step3_transformed.svg diff --git a/docs/sphinx_setup/_static/images/img/subgraphs_dumper_arch_diaram.png b/docs/articles_en/assets/images/subgraphs_dumper_arch_diaram.png similarity index 100% rename from 
docs/sphinx_setup/_static/images/img/subgraphs_dumper_arch_diaram.png rename to docs/articles_en/assets/images/subgraphs_dumper_arch_diaram.png diff --git a/docs/sphinx_setup/_static/images/synch-vs-asynch.svg b/docs/articles_en/assets/images/synch-vs-asynch.svg similarity index 100% rename from docs/sphinx_setup/_static/images/synch-vs-asynch.svg rename to docs/articles_en/assets/images/synch-vs-asynch.svg diff --git a/docs/sphinx_setup/_static/images/tokenization.svg b/docs/articles_en/assets/images/tokenization.svg similarity index 100% rename from docs/sphinx_setup/_static/images/tokenization.svg rename to docs/articles_en/assets/images/tokenization.svg diff --git a/docs/sphinx_setup/_static/images/torch_compile_backend_openvino.svg b/docs/articles_en/assets/images/torch_compile_backend_openvino.svg similarity index 100% rename from docs/sphinx_setup/_static/images/torch_compile_backend_openvino.svg rename to docs/articles_en/assets/images/torch_compile_backend_openvino.svg diff --git a/docs/sphinx_setup/_static/images/training_extensions_framework.png b/docs/articles_en/assets/images/training_extensions_framework.png similarity index 100% rename from docs/sphinx_setup/_static/images/training_extensions_framework.png rename to docs/articles_en/assets/images/training_extensions_framework.png diff --git a/docs/sphinx_setup/_static/images/transformations_structure.png b/docs/articles_en/assets/images/transformations_structure.png similarity index 100% rename from docs/sphinx_setup/_static/images/transformations_structure.png rename to docs/articles_en/assets/images/transformations_structure.png diff --git a/docs/sphinx_setup/_static/images/view_on_github.svg b/docs/articles_en/assets/images/view_on_github.svg similarity index 100% rename from docs/sphinx_setup/_static/images/view_on_github.svg rename to docs/articles_en/assets/images/view_on_github.svg diff --git a/docs/articles_en/documentation/legacy-features/install-dev-tools.rst b/docs/articles_en/documentation/legacy-features/install-dev-tools.rst index 59e97b69ab7444..6466eb8711a381 100644 --- a/docs/articles_en/documentation/legacy-features/install-dev-tools.rst +++ b/docs/articles_en/documentation/legacy-features/install-dev-tools.rst @@ -153,7 +153,7 @@ For example, to install and configure dependencies required for working with Ten For more details on the openvino-dev PyPI package, see `pypi.org `__ . Step 5. Test the Installation -+++++++++++++++++++++++++++++ +------------------------------ To verify the package is properly installed, run the command below (this may take a few seconds): @@ -173,7 +173,7 @@ Learn more about OpenVINO and use it in your own application by trying out some Get started with Python +++++++++++++++++++++++ -.. image:: ../../_static/images/get_started_with_python.gif +.. image:: ../../assets/images/get_started_with_python.gif :width: 400 Try the `Python Quick Start Example <../../notebooks/vision-monodepth-with-output.html>`__ to estimate depth in a scene using an OpenVINO monodepth model in a Jupyter Notebook inside your web browser. @@ -187,7 +187,7 @@ Visit the :doc:`Tutorials <../../learn-openvino/interactive-tutorials-python>` p Get started with C++ ++++++++++++++++++++ -.. image:: ../../_static/images/get_started_with_cpp.jpg +.. 
image:: ../../assets/images/get_started_with_cpp.jpg :width: 400 diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-cutting-parts-of-a-model.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-cutting-parts-of-a-model.rst index 917998c7ebaf9c..e2099fdc2b0562 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-cutting-parts-of-a-model.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-cutting-parts-of-a-model.rst @@ -43,14 +43,14 @@ The input model is converted as a whole if neither ``input`` nor ``output`` comm For Inception_V1, there is one ``Placeholder``: input. If the model is viewed in TensorBoard, the input operation is easy to find: -.. image:: ../../../../_static/images/inception_v1_std_input.svg +.. image:: ../../../../assets/images/inception_v1_std_input.svg :alt: Placeholder in Inception V1 ``Reshape`` is the only output operation, which is enclosed in a nested name scope of ``InceptionV1/Logits/Predictions``, under the full name of ``InceptionV1/Logits/Predictions/Reshape_1``. In TensorBoard, along with some of its predecessors, it looks as follows: -.. image:: ../../../../_static/images/inception_v1_std_output.svg +.. image:: ../../../../assets/images/inception_v1_std_output.svg :alt: TensorBoard with predecessors Convert this model to ``ov.Model``: @@ -150,7 +150,7 @@ Model Cutting Now, consider how to cut some parts of the model off. This chapter describes the first convolution block ``InceptionV1/InceptionV1/Conv2d_1a_7x7`` of the Inception V1 model to illustrate cutting: -.. image:: ../../../../_static/images/inception_v1_first_block.svg +.. image:: ../../../../assets/images/inception_v1_first_block.svg :alt: Inception V1 first convolution block Cutting at the End diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-deep-speech.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-deep-speech.rst index 65af9f6322fb12..71c28a5db9205d 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-deep-speech.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-deep-speech.rst @@ -77,7 +77,7 @@ Refer to the :doc:`Using Shape Inference <../../../../../../openvino-workflow/ru The second is that the frozen model still has two variables: ``previous_state_c`` and ``previous_state_h``, figure with the frozen *.pb model is below. It means that the model keeps training these variables at each inference. -.. image:: ./../../../../../../_static/images/DeepSpeech-0.8.2.png +.. image:: ../../../../../../assets/images/DeepSpeech-0.8.2.png At the first inference, the variables are initialized with zero tensors. After execution, the results of the ``BlockLSTM`` are assigned to cell state and hidden state, which are these two variables. 
diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-face-net.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-face-net.rst index 3d191b0859a2bf..3dc28444781b1a 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-face-net.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-face-net.rst @@ -5,15 +5,15 @@ Converting TensorFlow FaceNet Models .. meta:: - :description: Learn how to convert a FaceNet model + :description: Learn how to convert a FaceNet model from TensorFlow to the OpenVINO Intermediate Representation. .. danger:: The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Supported Model Formats <../../../../../../openvino-workflow/model-preparation>` article. - + This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Supported Model Formats <../../../../../../openvino-workflow/model-preparation>` article. + `Public pre-trained FaceNet models `__ contain both training and inference part of graph. Switch between this two states is manageable with placeholder value. Intermediate Representation (IR) models are intended for inference, which means that train part is redundant. @@ -21,7 +21,7 @@ Intermediate Representation (IR) models are intended for inference, which means There are two inputs in this network: boolean ``phase_train`` which manages state of the graph (train/infer) and ``batch_size`` which is a part of batch joining pattern. -.. image:: ./../../../../../../_static/images/FaceNet.svg +.. image:: ../../../../../../assets/images/FaceNet.svg Converting a TensorFlow FaceNet Model to the IR ############################################### diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-language-1b.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-language-1b.rst index 5b0b97f42e78de..248d41f7eea4a7 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-language-1b.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-language-1b.rst @@ -92,7 +92,7 @@ Once you have downloaded the pretrained model files, you will have the ``lm_1b`` -.. image:: ./../../../../../../_static/images/lm_1b.svg +.. 
image:: ../../../../../../assets/images/lm_1b.svg The frozen model still has two variables: ``Variable`` and ``Variable_1``. It means that the model keeps training those variables at each inference. diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-ncf.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-ncf.rst index 111549f2afd6b5..5a944288906b14 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-ncf.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-ncf.rst @@ -43,7 +43,7 @@ This tutorial explains how to convert Neural Collaborative Filtering (NCF) model 3. Convert the model to the OpenVINO format. If you look at your frozen model, you can see that it has one input that is split into four ``ResourceGather`` layers. (Click image to zoom in.) - .. image:: ./../../../../../../_static/images/NCF_start.svg + .. image:: ../../../../../../assets/images/NCF_start.svg However, as the model conversion API does not support such data feeding, you should skip it. Cut the edges incoming in ``ResourceGather`` port 1: diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-troubleshooting-reshape-errors.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-troubleshooting-reshape-errors.rst index 18cc42f36ad6ec..66a8f4563bc9ef 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-troubleshooting-reshape-errors.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-troubleshooting-reshape-errors.rst @@ -48,7 +48,7 @@ To fix some operators which prevent normal shape propagation: With ``->[0 -1]``, this new ``Parameter`` is replaced by a ``Constant`` operator which has the ``[0, -1]`` value. Since the ``Reshape`` operator has ``0`` and ``-1`` as specific values, it allows propagating shapes freely without losing the intended meaning of ``Reshape``. For more information, see :doc:`the specification <../../../openvino-ir-format/operation-sets/operation-specs/shape/reshape-1>`. - .. image:: ../../../../_static/images/batch_relaxation.png + .. image:: ../../../../assets/images/batch_relaxation.png * transform the model conversion on the back phase. For more information, see the :doc:`How to Convert a Model <../legacy-model-optimizer-extensibility>`, * transform OpenVINO Model during the runtime. 
For more information, see :doc:`OpenVINO Runtime Transformations <../../../openvino-extensibility/transformation-api>`, diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility.rst index 965e6be70f4c80..1ecf2a55b94ec1 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility.rst @@ -83,7 +83,7 @@ Model Conversion Pipeline A model conversion pipeline can be represented with the following diagram: -.. image:: ../../../_static/images/MO_conversion_pipeline.svg +.. image:: ../../../assets/images/MO_conversion_pipeline.svg Each conversion step is reviewed in details below. @@ -100,7 +100,7 @@ is a separate loader for each supported framework. These loaders are implemented The result of a model loading step is a ``Graph`` object, which can be depicted like in the following example: -.. image:: ../../../_static/images/MO_graph_after_loader.svg +.. image:: ../../../assets/images/MO_graph_after_loader.svg Model Optimizer loader saves an operation instance framework description (usually it is a Protobuf message) into a node attribute usually with a name ``pb`` for each operation of an input model. It is important that this is a @@ -134,7 +134,7 @@ The extractors execution order is the following: The result of operations attributes extracting step can be depicted like in the following example: -.. image:: ../../../_static/images/MO_graph_after_extractors.svg +.. image:: ../../../assets/images/MO_graph_after_extractors.svg The only difference in the graph from the previous step is that nodes contain dictionary with extracted attributes and operation-specific attributes needed for Model Optimizer. However, from this step, Model Optimizer does not @@ -203,7 +203,7 @@ Model Optimizer does not have value propagation implementation for the operation Before running partial inference, the graph can be depicted like in the following example: -.. image:: ../../../_static/images/MO_graph_before_partial_inference.svg +.. image:: ../../../assets/images/MO_graph_before_partial_inference.svg The difference in a graph structure with a graph during the front phase is not only in the data nodes, but also in the edge attributes. 
Note that an ``out`` attribute is specified for edges **from operation** nodes only, while an ``in`` diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-graph-traversal-and-modification.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-graph-traversal-and-modification.rst index 1c8aa73b014cbd..b2be35f4452832 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-graph-traversal-and-modification.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-graph-traversal-and-modification.rst @@ -95,7 +95,7 @@ port with ``idx = 2`` corresponds to the incoming edge of a node with an attribu Consider the example of a graph part with 4 operation nodes "Op1", "Op2", "Op3", and "Op4" and a number of data nodes depicted with light green boxes. -.. image:: ../../../../_static/images/MO_ports_example_1.svg +.. image:: ../../../../assets/images/MO_ports_example_1.svg :scale: 80 % :align: center @@ -132,7 +132,7 @@ For example, applying the following two methods to the graph above will result i op4.in_port(1).disconnect() op3.out_port(0).connect(op4.in_port(1)) -.. image:: ../../../../_static/images/MO_ports_example_2.svg +.. image:: ../../../../assets/images/MO_ports_example_2.svg :scale: 80 % :align: center @@ -165,7 +165,7 @@ example, the function call ``op3.out_port(0).get_connection().set_source(op1.out consuming data from port ``op3.out_port(0)`` to ``op1.out_port(0)``. The transformed graph from the sample above is depicted below: -.. image:: ../../../../_static/images/MO_connection_example_1.svg +.. image:: ../../../../assets/images/MO_connection_example_1.svg :scale: 80 % :align: center diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-graph-transformation-extensions.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-graph-transformation-extensions.rst index 39162e5c6fc78a..ea6f51aa61e227 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-graph-transformation-extensions.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-graph-transformation-extensions.rst @@ -34,7 +34,7 @@ order. To execute the transformation during a proper model conversion phase, Mod anchor transformations that do nothing. All transformations are ordered with respect to these anchor transformations. The diagram below shows anchor transformations, some of built-in transformations and dependencies between them: -.. image:: ../../../../../_static/images/MO_transformations_graph.svg +.. image:: ../../../../../assets/images/MO_transformations_graph.svg User-defined transformations are executed after the corresponding ``Start`` and before the corresponding ``Finish`` anchor transformations by default (if ``run_before()`` and ``run_after()`` methods have not been overridden). 
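As a rough orientation for the ordering mechanism described above, the sketch below shows how a user-defined transformation typically declares its position relative to the anchor transformations via ``run_after()`` and ``run_before()``. This is a sketch only: the module paths and anchor class names are assumptions about the legacy ``openvino-dev`` package layout and should be verified against the installed version.

.. code-block:: python

   # Sketch only: the module paths below assume the legacy Model Optimizer
   # layout shipped with openvino-dev; verify them against your installation.
   from openvino.tools.mo.middle.replacement import MiddleReplacementPattern


   class MyMiddleTransformation(MiddleReplacementPattern):
       enabled = True

       def run_after(self):
           # Execute somewhere after the MiddleStart anchor transformation.
           from openvino.tools.mo.middle.pass_separator import MiddleStart
           return [MiddleStart]

       def run_before(self):
           # ...and before the MiddleFinish anchor transformation.
           from openvino.tools.mo.middle.pass_separator import MiddleFinish
           return [MiddleFinish]

       def find_and_replace_pattern(self, graph):
           pass  # graph modification logic goes here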
diff --git a/docs/articles_en/documentation/openvino-ecosystem/datumaro.rst b/docs/articles_en/documentation/openvino-ecosystem/datumaro.rst index 2aa60cc18fb61a..c6b257ae3f17ca 100644 --- a/docs/articles_en/documentation/openvino-ecosystem/datumaro.rst +++ b/docs/articles_en/documentation/openvino-ecosystem/datumaro.rst @@ -21,7 +21,9 @@ Plus, enjoy `Jupyter notebooks `__ @@ -142,9 +142,9 @@ Begin this step on the Intel® Core™ or Xeon® processor machine that meets th 3. Install the Kernel-based Virtual Machine (KVM) and QEMU packages. - .. code-block:: sh + .. code-block:: sh - sudo apt install qemu qemu-kvm libvirt-bin bridge-utils virt-manager + sudo apt install qemu qemu-kvm libvirt-bin bridge-utils virt-manager 4. Check the QEMU version: @@ -288,16 +288,16 @@ This example in this step uses the following names. Your configuration might use 10. Create a script named ``virbr0-qemu-ifdown`` to bring down the ``virbr0`` interface. Add the following script contents: - .. code-block:: sh + .. code-block:: sh - #!/bin/sh - nic=$1 - if [ -f /etc/default/qemu-kvm ]; then - . /etc/default/qemu-kvm - fi - switch=virbr0 - brctl delif $switch $nic - ifconfig $nic 0.0.0.0 down + #!/bin/sh + nic=$1 + if [ -f /etc/default/qemu-kvm ]; then + . /etc/default/qemu-kvm + fi + switch=virbr0 + brctl delif $switch $nic + ifconfig $nic 0.0.0.0 down See the QEMU documentation for more information about the QEMU network configuration. @@ -390,43 +390,43 @@ As an option, you can use ``virsh`` and the virtual machine manager to create an 10. Start the vTPM on Host, write the HW TPM data into its NVRAM and restart the vTPM for QEMU: - .. code-block:: sh + .. code-block:: sh - sudo swtpm socket --tpm2 --server port=8280 \ - --ctrl type=tcp,port=8281 \ - --flags not-need-init --tpmstate dir=/var/OVSA/vtpm/vtpm_isv_dev & + sudo swtpm socket --tpm2 --server port=8280 \ + --ctrl type=tcp,port=8281 \ + --flags not-need-init --tpmstate dir=/var/OVSA/vtpm/vtpm_isv_dev & - sudo tpm2_startup --clear -T swtpm:port=8280 - sudo tpm2_startup -T swtpm:port=8280 - python3 /Scripts/host/OVSA_write_hwquote_swtpm_nvram.py 8280 - sudo pkill -f vtpm_isv_dev + sudo tpm2_startup --clear -T swtpm:port=8280 + sudo tpm2_startup -T swtpm:port=8280 + python3 /Scripts/host/OVSA_write_hwquote_swtpm_nvram.py 8280 + sudo pkill -f vtpm_isv_dev - swtpm socket --tpmstate dir=/var/OVSA/vtpm/vtpm_isv_dev \ - --tpm2 \ - --ctrl type=unixio,path=/var/OVSA/vtpm/vtpm_isv_dev/swtpm-sock \ - --log level=20 + swtpm socket --tpmstate dir=/var/OVSA/vtpm/vtpm_isv_dev \ + --tpm2 \ + --ctrl type=unixio,path=/var/OVSA/vtpm/vtpm_isv_dev/swtpm-sock \ + --log level=20 11. Start the Guest VM: - .. code-block:: sh - - sudo qemu-system-x86_64 \ - -cpu host \ - -enable-kvm \ - -m 8192 \ - -smp 8,sockets=1,cores=8,threads=1 \ - -device e1000,netdev=hostnet0,mac=52:54:00:d1:66:6f \ - -netdev tap,id=hostnet0,script=/br0-qemu-ifup,downscript=/br0-qemu-ifdown \ - -device e1000,netdev=hostnet1,mac=52:54:00:d1:66:5f \ - -netdev tap,id=hostnet1,script=/virbr0-qemu-ifup,downscript=/virbr0-qemu-ifdown \ - -drive if=virtio,file=/ovsa_isv_dev_vm_disk.qcow2,cache=none \ - -chardev socket,id=chrtpm,path=/var/OVSA/vtpm/vtpm_isv_dev/swtpm-sock \ - -tpmdev emulator,id=tpm0,chardev=chrtpm \ - -device tpm-tis,tpmdev=tpm0 \ - -vnc :1 + .. code-block:: sh - Use the QEMU runtime options in the command to change the memory amount or CPU assigned to this Guest VM. 
+ sudo qemu-system-x86_64 \ + -cpu host \ + -enable-kvm \ + -m 8192 \ + -smp 8,sockets=1,cores=8,threads=1 \ + -device e1000,netdev=hostnet0,mac=52:54:00:d1:66:6f \ + -netdev tap,id=hostnet0,script=/br0-qemu-ifup,downscript=/br0-qemu-ifdown \ + -device e1000,netdev=hostnet1,mac=52:54:00:d1:66:5f \ + -netdev tap,id=hostnet1,script=/virbr0-qemu-ifup,downscript=/virbr0-qemu-ifdown \ + -drive if=virtio,file=/ovsa_isv_dev_vm_disk.qcow2,cache=none \ + -chardev socket,id=chrtpm,path=/var/OVSA/vtpm/vtpm_isv_dev/swtpm-sock \ + -tpmdev emulator,id=tpm0,chardev=chrtpm \ + -device tpm-tis,tpmdev=tpm0 \ + -vnc :1 + + Use the QEMU runtime options in the command to change the memory amount or CPU assigned to this Guest VM. 12. Use a VNC client to log on to the Guest VM at ``:1`` @@ -701,9 +701,9 @@ The Model Hosting components install the OpenVINO™ Security Add-on Runtime Doc 1. Log on to the Guest VM as ````. 2. Create the OpenVINO™ Security Add-on directory in the home directory - .. code-block:: sh + .. code-block:: sh - mkdir -p ~/OVSA + mkdir -p ~/OVSA 3. While on the Host Machine copy the ovsa-model-hosting.tar.gz from release_files to the Guest VM: @@ -744,7 +744,7 @@ The following figure describes the interactions between the Model Developer, Ind The Model Developer/Independent Software Vendor and User roles are related to virtual machine use and one person might fill the tasks required by multiple roles. In this document the tasks of Model Developer and Independent Software Vendor are combined and use the Guest VM named ``ovsa_isv``. It is possible to have all roles set up on the same Host Machine. -.. image:: ../../_static/images/ovsa_example.svg +.. image:: ../../assets/images/ovsa_example.svg Model Developer Instructions ++++++++++++++++++++++++++++ @@ -770,7 +770,8 @@ Step 2: Create a key store and add a certificate to it ------------------------------------------------------ 1. Create files to request a certificate: -This example uses a self-signed certificate for demonstration purposes. In a production environment, use CSR files to request for a CA-signed certificate. + + This example uses a self-signed certificate for demonstration purposes. In a production environment, use CSR files to request for a CA-signed certificate. .. code-block:: sh @@ -869,8 +870,8 @@ Step 7: Receive a User Request 5. Provide these files to the User: - * ``face_detection_model.dat`` - * ``face_detection_model.lic`` + * ``face_detection_model.dat`` + * ``face_detection_model.lic`` Model User Instructions +++++++++++++++++++++++ @@ -988,9 +989,9 @@ Step 5: Start the NGINX Model Server The NGINX Model Server publishes the access controlled model. - .. code-block:: sh +.. code-block:: sh - ./start_secure_ovsa_model_server.sh + ./start_secure_ovsa_model_server.sh For information about the NGINX interface follow `here `__. @@ -1051,6 +1052,7 @@ References ########## Use these links for more information: + - `OpenVINO toolkit `__ - `OpenVINO Model Server Quick Start Guide `__ - `Model repository `__ diff --git a/docs/articles_en/documentation/openvino-ecosystem/openvino-training-extensions.rst b/docs/articles_en/documentation/openvino-ecosystem/openvino-training-extensions.rst index 76bb29e7925a32..c261c6dd06ce0a 100644 --- a/docs/articles_en/documentation/openvino-ecosystem/openvino-training-extensions.rst +++ b/docs/articles_en/documentation/openvino-ecosystem/openvino-training-extensions.rst @@ -18,7 +18,7 @@ inference. It allows you to export and convert the models to the needed format. 
Detailed Workflow ################# -.. image:: ./../../_static/images/training_extensions_framework.png +.. image:: ../../assets/images/training_extensions_framework.png 1. To start working with OpenVINO Training Extensions, prepare and annotate your dataset. For example, on CVAT. diff --git a/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/low-precision-transformations.rst b/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/low-precision-transformations.rst index df0edd1f038135..c8e041e5a367e9 100644 --- a/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/low-precision-transformations.rst +++ b/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/low-precision-transformations.rst @@ -79,7 +79,7 @@ If operation is not supported by LPT then dequantization operation will not be p For example, if you would like to infer a model with ``Convolution`` operation in low precision then the model can look as on picture below: -.. image:: ../../../../_static/images/model_fq_and_convolution.common.svg +.. image:: ../../../../assets/images/model_fq_and_convolution.common.svg :alt: Quantized Convolution There are several supported quantization approaches on activations and on weights. All supported approaches are described in `Quantization approaches <#quantization-approaches>`__ section below. In demonstrated model `FakeQuantize operation quantization <#fakequantize-operation>`__ approach is used. @@ -104,7 +104,7 @@ FakeQuantize operation In this case ``FakeQuantize`` operation is used on activations and quantized constant on weights. Original input model: -.. image:: ../../../../_static/images/model_fq_and_convolution.common.svg +.. image:: ../../../../assets/images/model_fq_and_convolution.common.svg :alt: Original model with FakeQuantize @@ -113,7 +113,7 @@ Quantize and dequantization operations In this case ``FakeQuantize`` operation and ``Convert`` are used as quantize operation and return quantized low precision tensor. After quantize operation on activations there are ``Convert`` and dequantization operations to compensate decomposition. Original input model: -.. image:: ../../../../_static/images/model_qdq_and_convolution.common.svg +.. image:: ../../../../assets/images/model_qdq_and_convolution.common.svg :alt: Original model with Q/DQ In both cases result is the same. In LPT result model you can see that: @@ -129,7 +129,7 @@ In both cases result is the same. In LPT result model you can see that: LPT result model: -.. image:: ../../../../_static/images/model_fq_and_convolution.transformed.svg +.. image:: ../../../../assets/images/model_fq_and_convolution.transformed.svg :alt: Result model Low precision transformations pipeline @@ -137,7 +137,7 @@ Low precision transformations pipeline LPT transformation pipeline has several steps. For each transformation inside one step pattern matcher is unique per transformation, but each operation can be assigned to several transformations. -.. image:: ../../../../_static/images/low_precision_transformation_pipeline.svg +.. image:: ../../../../assets/images/low_precision_transformation_pipeline.svg :alt: Low precision transformations pipeline Inside each step LPT transformations handle input model operation by operation, applying transformation matching pattern for each transformation from the step to an operation, and execute transformation if pattern is matched. 
Decomposition transformation decomposes ``FakeQuantize`` to quantize and dequantization operations. Dequantization operations from previous transformation result is used for the current one and so on, until the end of the model is achieved. @@ -227,12 +227,12 @@ Decomposition transformations decompose the ``FakeQuantize`` operation to: quant Original ``FakeQuantize``: -.. image:: ../../../../_static/images/fq.common.svg +.. image:: ../../../../assets/images/fq.common.svg :alt: FakeQuantize operation before LPT ``FakeQuantize`` after decomposition to quantization and dequantization operations: -.. image:: ../../../../_static/images/fq.transformed.svg +.. image:: ../../../../assets/images/fq.transformed.svg :alt: FakeQuantize operation after LPT Dequantization operations handling transformations @@ -242,12 +242,12 @@ In this step, LPT transformations fuse dequantization operations or move them th Original ``Convolution`` operation in FP32 with dequantization operations before: -.. image:: ../../../../_static/images/model_fq_and_convolution.common.svg +.. image:: ../../../../assets/images/model_fq_and_convolution.common.svg :alt: Convolution operation before LPT ``Convolution`` operation in INT8 after decomposition and dequantization operations handling: -.. image:: ../../../../_static/images/model_fq_and_convolution.transformed.svg +.. image:: ../../../../assets/images/model_fq_and_convolution.transformed.svg :alt: Convolution operation after LPT @@ -270,12 +270,12 @@ There are more details in developer guide :doc:`Cleanup transformations `. Interpreting FakeQuantize at runtime @@ -27,45 +27,45 @@ During the model load each plugin can interpret quantization rules expressed in * Independently based on the definition of *FakeQuantize* operation. * Using a special library of low-precision transformations (LPT) which applies common rules for generic operations, such as Convolution, Fully-Connected, Eltwise, etc., and translates "fake-quantized" models into models with low-precision operations. -Here we provide only a high-level overview of the interpretation rules of FakeQuantize. -At runtime each FakeQuantize can be split into two independent operations: **Quantize** and **Dequantize**. -The former one is aimed to transform the input data into the target precision while the latter transforms the resulting values back to the original range and precision. -In practice *Dequantize* operations can be propagated forward through the linear operations, such as *Convolution* or *Fully-Connected*, +Here we provide only a high-level overview of the interpretation rules of FakeQuantize. +At runtime each FakeQuantize can be split into two independent operations: **Quantize** and **Dequantize**. +The former one is aimed to transform the input data into the target precision while the latter transforms the resulting values back to the original range and precision. +In practice *Dequantize* operations can be propagated forward through the linear operations, such as *Convolution* or *Fully-Connected*, and in some cases fused with the following *Quantize* operation for the next layer into the so-called *Requantize* operation (see Fig. 1). -.. image:: ../../../../_static/images/qdq_propagation.png +.. image:: ../../../../assets/images/qdq_propagation.png Figure 1. Quantization operations propagation at runtime. Q, DQ, RQ stand for Quantize, Dequantize, and Requantize correspondingly. 
-From the calculation standpoint, the FakeQuantize formula also is split into two parts accordingly: +From the calculation standpoint, the FakeQuantize formula also is split into two parts accordingly: ``output = round((x - input_low) / (input_high - input_low) * (levels-1)) / (levels-1) * (output_high - output_low) + output_low`` -The first part of this formula represents *Quantize* operation: +The first part of this formula represents *Quantize* operation: -``q = round((x - input_low) / (input_high - input_low) * (levels-1))`` +``q = round((x - input_low) / (input_high - input_low) * (levels-1))`` -The second is responsible for the dequantization: +The second is responsible for the dequantization: -``r = q / (levels-1) * (output_high - output_low) + output_low`` +``r = q / (levels-1) * (output_high - output_low) + output_low`` -From the scale/zero-point notation standpoint the latter formula can be written as follows: +From the scale/zero-point notation standpoint the latter formula can be written as follows: -``r = (output_high - output_low) / (levels-1) * (q + output_low / (output_high - output_low) * (levels-1))`` +``r = (output_high - output_low) / (levels-1) * (q + output_low / (output_high - output_low) * (levels-1))`` Thus we can define: * **Scale** as ``(output_high - output_low) / (levels-1)`` * **Zero-point** as ``-output_low / (output_high - output_low) * (levels-1)`` -.. note:: +.. note:: During the quantization process the values ``input_low``, ``input_high``, ``output_low``, ``output_high`` are selected so that to map a floating-point zero exactly to an integer value (zero-point) and vice versa. Quantization specifics and restrictions ####################################### In general, OpenVINO can represent and execute quantized models from different sources. However, the Neural Network Compression Framework (NNCF) -is considered the default way to get optimized models. Since the NNCF supports HW-aware quantization it means that specific rules can be implemented in it for +is considered the default way to get optimized models. Since the NNCF supports HW-aware quantization it means that specific rules can be implemented in it for the particular HW. However, it is reasonable to have compatibility with general-purpose HW such as CPU and GPU and support their quantization schemes. Below we define these rules as follows: @@ -73,6 +73,6 @@ Below we define these rules as follows: * Per-channel quantization of weights of Convolutional and Fully-Connected layers. * Per-channel quantization of activations for channel-wise and element-wise operations, e.g. Depthwise Convolution, Eltwise Add/Mul, ScaleShift. * Symmetric and asymmetric quantization of weights and activations with the support of per-channel scales and zero-points. -* Non-unified quantization parameters for Eltwise and Concat operations. +* Non-unified quantization parameters for Eltwise and Concat operations. * Non-quantized network output, i.e. there are no quantization parameters for it. 
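To make the quantize/dequantize split above concrete, here is a small NumPy sketch (illustration only, not OpenVINO code) that evaluates the *Quantize* and *Dequantize* parts of the formula and checks that the scale/zero-point form reproduces the same dequantized values. The ranges and input values are arbitrary example numbers.

.. code-block:: python

   import numpy as np

   # Example ranges; FakeQuantize clamps x to [input_low, input_high] first,
   # and the values below are already inside that range.
   x = np.array([-1.0, -0.37, 0.0, 0.42, 1.0], dtype=np.float32)
   input_low, input_high = -1.0, 1.0
   output_low, output_high = -1.0, 1.0
   levels = 256

   # Quantize: q = round((x - input_low) / (input_high - input_low) * (levels - 1))
   q = np.round((x - input_low) / (input_high - input_low) * (levels - 1))

   # Dequantize: r = q / (levels - 1) * (output_high - output_low) + output_low
   r = q / (levels - 1) * (output_high - output_low) + output_low

   # Equivalent scale/zero-point view of the dequantization
   scale = (output_high - output_low) / (levels - 1)
   zero_point = -output_low / (output_high - output_low) * (levels - 1)
   assert np.allclose(r, scale * (q - zero_point))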
diff --git a/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/quantized-models/low-precision-model-representation.rst b/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/quantized-models/low-precision-model-representation.rst index 2435aebd6a4242..abecc2cfa8f580 100644 --- a/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/quantized-models/low-precision-model-representation.rst +++ b/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/quantized-models/low-precision-model-representation.rst @@ -24,7 +24,7 @@ In order to be able to execute a particular DL operation in low-precision all it between operation and data blobs. The figure below shows an example of quantized Convolution which contains two FakeQuantize nodes: one for weights and one for activations (bias is quantized using the same parameters). -.. image:: ../../../../../_static/images/IE_PLUGIN_DG/images/quantized_convolution.png +.. image:: ../../../../../assets/images/quantized_convolution.png Starting from OpenVINO 2020.2 release all the quantized models are represented in the compressed form. It means that the weights @@ -32,4 +32,4 @@ of low-precision operations are converted into the target precision (e.g. INT8). The rest of the parameters can be represented in FLOAT32 or FLOAT16 precision depending on the input full-precision model used in the quantization process. Fig. 2 below shows an example of the part of the compressed IR. -.. image:: ../../../../../_static/images/IE_PLUGIN_DG/images/quantized_model_example.png +.. image:: ../../../../../assets/images/quantized_model_example.png diff --git a/docs/articles_en/documentation/openvino-extensibility/transformation-api.rst b/docs/articles_en/documentation/openvino-extensibility/transformation-api.rst index 2f6ea47c4441cc..5e28a22e69ab98 100644 --- a/docs/articles_en/documentation/openvino-extensibility/transformation-api.rst +++ b/docs/articles_en/documentation/openvino-extensibility/transformation-api.rst @@ -49,7 +49,7 @@ Let's start with OpenVINO™ helper functions. The most popular function is ``ov We will review real replacement case where Negative operation is replaced with Multiply. -.. image:: ./../../_static/images/ov_replace_node.png +.. image:: ../../assets/images/ov_replace_node.png .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.cpp :language: cpp @@ -65,7 +65,7 @@ The alternative way to do the same replacement is the following: Another transformation example is insertion. -.. image:: ./../../_static/images/ov_insert_node.png +.. image:: ../../assets/images/ov_insert_node.png .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.cpp :language: cpp @@ -101,7 +101,7 @@ OpenVINO™ Runtime has three main transformation types: * :doc:`Matcher pass ` - pattern-based transformation approach * :doc:`Graph rewrite pass ` - container for matcher passes needed for efficient execution -.. image:: ./../../_static/images/transformations_structure.png +.. 
image:: ../../assets/images/transformations_structure.png Transformation conditional compilation ###################################### diff --git a/docs/articles_en/documentation/openvino-extensibility/transformation-api/graph-rewrite-pass.rst b/docs/articles_en/documentation/openvino-extensibility/transformation-api/graph-rewrite-pass.rst index 82057f8cb153d2..e2bc1f8d4bbf00 100644 --- a/docs/articles_en/documentation/openvino-extensibility/transformation-api/graph-rewrite-pass.rst +++ b/docs/articles_en/documentation/openvino-extensibility/transformation-api/graph-rewrite-pass.rst @@ -24,12 +24,12 @@ In addition, GraphRewrite handles nodes that were registered by MatcherPasses du GraphRewrite has two algorithms for MatcherPasses execution. First algorithm is straightforward. It applies each MatcherPass in registration order to current node. -.. image:: ./../../../_static/images/graph_rewrite_execution.png +.. image:: ../../../assets/images/graph_rewrite_execution.png But it is not really efficient when you have a lot of registered passes. So first of all GraphRewrite checks that all MatcherPass patterns has type-based root node (it means that type of this node is not hidden into predicate). And then creates map from registered MatcherPasses. That helps to avoid additional cost of applying each MatcherPass for each node. -.. image:: ./../../../_static/images/graph_rewrite_efficient_search.png +.. image:: ../../../assets/images/graph_rewrite_efficient_search.png .. note:: diff --git a/docs/articles_en/documentation/openvino-ir-format/intermediate-representation-int8-inference.rst b/docs/articles_en/documentation/openvino-ir-format/intermediate-representation-int8-inference.rst index d9557f98827aa0..a0cc4488ef15cc 100644 --- a/docs/articles_en/documentation/openvino-ir-format/intermediate-representation-int8-inference.rst +++ b/docs/articles_en/documentation/openvino-ir-format/intermediate-representation-int8-inference.rst @@ -28,7 +28,7 @@ For more details, see the :doc:`specification of FakeQuantize operation ` * :doc:`RNNSequence <../operation-specs/sequence/rnn-sequence-5>` * :doc:`ROIAlign <../operation-specs/detection/roi-align-9>` -* :doc:`ROIAlignRotated <../operation-specs/detection/roi-align-rotated-14>` * :doc:`ROIPooling <../operation-specs/detection/roi-pooling-1>` * :doc:`Roll <../operation-specs/movement/roll-7>` * :doc:`Round <../operation-specs/arithmetic/round-5>` diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst index a39de0b72d5a8e..250ef955bb41a8 100644 --- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst @@ -188,7 +188,6 @@ Operation Specifications RNNSequence-5 ROIAlign-3 ROIAlign-9 - ROIAlignRotated-14 ROIPooling-1 Roll-7 Round-5 diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/detection/roi-align-rotated-14.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/detection/roi-align-rotated-15.rst similarity index 95% rename from docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/detection/roi-align-rotated-14.rst rename to docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/detection/roi-align-rotated-15.rst index 7ec8acdd2238b6..1da1e33079c106 100644 --- 
a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/detection/roi-align-rotated-14.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/detection/roi-align-rotated-15.rst @@ -1,14 +1,14 @@ -.. {#openvino_docs_ops_detection_ROIAlignRotated_14} +.. {#openvino_docs_ops_detection_ROIAlignRotated_15} ROIAlignRotated =============== .. meta:: - :description: Learn about ROIAlignRotated-14 - an object detection operation, + :description: Learn about ROIAlignRotated-15 - an object detection operation, which can be performed on three required input tensors. -**Versioned name**: *ROIAlignRotated-14* +**Versioned name**: *ROIAlignRotated-15* **Category**: *Object detection* @@ -56,7 +56,7 @@ Each ROI box's center is shifted by [-0.5, -0.5] before pooling to achive better * *spatial_scale* * **Description**: *spatial_scale* is a multiplicative spatial scale factor to that is applied to the ROI box(height, weight and center vector) before pooling. - WARNING! + WARNING! Spatial scale is also applied to the center point of the ROI box. It means that scaling does not only change the size of the ROI box, but also its position. For example, if the spatial scale is 2.0, ROI box center is [0.5, 0.5], box width is 1.0 and box height is 1.0, then after scaling the ROI box center will be [1.0, 1.0], box width will be 2.0 and box height will be 2.0. * **Range of values**: a positive floating-point number @@ -67,7 +67,7 @@ Each ROI box's center is shifted by [-0.5, -0.5] before pooling to achive better * **Description**: If True, the angle for each ROI represents a clockwise rotation, otherwise - counterclockwise rotation. * **Type**: ``bool`` - * **Default value**: False + * **Default value**: False * **Required**: *no* **Inputs**: diff --git a/docs/articles_en/documentation/openvino-security/openvino-encrypted-models.rst b/docs/articles_en/documentation/openvino-security/openvino-encrypted-models.rst index 7a18392d2df38a..1a5d7261440c5e 100644 --- a/docs/articles_en/documentation/openvino-security/openvino-encrypted-models.rst +++ b/docs/articles_en/documentation/openvino-security/openvino-encrypted-models.rst @@ -22,7 +22,7 @@ After a model is optimized by model conversion API, it's deployed to target devi Encrypting and optimizing model before deploying it to the edge device can be used to protect deep-learning models. The edge device should keep the stored model protected all the time and have the model decrypted **in runtime only** for use by the OpenVINO Runtime. -.. image:: ../../_static/images/deploy_encrypted_model.svg +.. image:: ../../assets/images/deploy_encrypted_model.svg Loading Encrypted Models ######################## diff --git a/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst b/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst index e6d8b3a4170d04..ba5dd9dec91e65 100644 --- a/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst +++ b/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst @@ -87,13 +87,13 @@ To check if the driver has been installed: 1. Type **device manager** in the **Search Windows** field and press Enter. **Device Manager** will open. 2. Click the drop-down arrow to display **Display Adapters**. You can see the adapter that is installed in your computer: - .. image:: ../../_static/images/DeviceManager.PNG + .. image:: ../../assets/images/DeviceManager.PNG :width: 400 3. 
Right-click on the adapter name and select **Properties**. 4. Click the **Driver** tab to view the driver version. - .. image:: ../../_static/images/DeviceDriverVersion.svg + .. image:: ../../assets/images/DeviceDriverVersion.svg :width: 400 Your device driver has been updated and is now ready to use your GPU. diff --git a/docs/articles_en/learn-openvino/interactive-tutorials-python.rst b/docs/articles_en/learn-openvino/interactive-tutorials-python.rst index 98719478526bb9..d1215627a4d381 100644 --- a/docs/articles_en/learn-openvino/interactive-tutorials-python.rst +++ b/docs/articles_en/learn-openvino/interactive-tutorials-python.rst @@ -48,10 +48,10 @@ Additional Resources * `Google Colab `__ -.. |binder logo| image:: ../_static/images/launch_in_binder.svg +.. |binder logo| image:: ../assets/images/launch_in_binder.svg :class: notebook-badge-p :alt: Binder button -.. |colab logo| image:: ../_static/images/open_in_colab.svg +.. |colab logo| image:: ../assets/images/open_in_colab.svg :class: notebook-badge-p :alt: Google Colab button diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/ov-tokenizers.rst b/docs/articles_en/learn-openvino/llm_inference_guide/ov-tokenizers.rst index e7e59ba5755ec0..571743701ce01a 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/ov-tokenizers.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/ov-tokenizers.rst @@ -8,7 +8,7 @@ Tokenizers convert the input text into a sequence of tokens with corresponding I the model can understand and process it during inference. The transformation of a sequence of numbers into a string is called detokenization. -.. image:: ../../_static/images/tokenization.svg +.. image:: ../../assets/images/tokenization.svg :align: center There are two important points in the tokenizer-model relation: diff --git a/docs/articles_en/learn-openvino/openvino-samples/get-started-demos.rst b/docs/articles_en/learn-openvino/openvino-samples/get-started-demos.rst index 9d0ff76275661e..a0137b0ee25d8f 100644 --- a/docs/articles_en/learn-openvino/openvino-samples/get-started-demos.rst +++ b/docs/articles_en/learn-openvino/openvino-samples/get-started-demos.rst @@ -472,7 +472,7 @@ The following command shows how to run the Image Classification Code Sample usin When the sample application is complete, you are given the label and confidence for the top 10 categories. The input image and sample output of the inference results is shown below: -.. image:: ../../_static/images/dog.png +.. image:: ../../assets/images/dog.png .. code-block:: sh diff --git a/docs/articles_en/openvino-workflow.rst b/docs/articles_en/openvino-workflow.rst index b9a7162f4786a7..9c984b674a28da 100644 --- a/docs/articles_en/openvino-workflow.rst +++ b/docs/articles_en/openvino-workflow.rst @@ -6,8 +6,8 @@ OpenVINO Workflow .. meta:: - :description: OpenVINO toolkit workflow usually involves preparation, - optimization, and compression of models, running inference and + :description: OpenVINO toolkit workflow usually involves preparation, + optimization, and compression of models, running inference and deploying deep learning applications. .. toctree:: @@ -20,7 +20,7 @@ OpenVINO Workflow Deployment on a Local System Deployment on a Model Server openvino-workflow/torch-compile - + OpenVINO offers multiple workflows, depending on the use case and personal or project preferences. This section will give you a detailed view of how you can go from preparing your model, @@ -33,18 +33,18 @@ you can decide how to proceed: .. 
tab-item:: Workflow for convenience - This approach assumes you run your model directly. + This approach assumes you run your model directly. - .. image:: _static/images/ov_workflow_diagram_convenience.svg + .. image:: ./assets/images/ov_workflow_diagram_convenience.svg :align: center :alt: OpenVINO workflow diagram for convenience .. tab-item:: Workflow for performance (recommended for production) This approach assumes you convert your model to OpenVINO IR explicitly, which means the - conversion stage is not part of the final application. + conversion stage is not part of the final application. - .. image:: _static/images/ov_workflow_diagram_performance.svg + .. image:: ./assets/images/ov_workflow_diagram_performance.svg :align: center :alt: OpenVINO workflow diagram for performance @@ -74,7 +74,7 @@ OpenVINO uses the following functions for reading, converting, and saving models .. tab-item:: save_model * Saves an ov.Model to OpenVINO IR format. - * Compresses weights to FP16 by default. + * Compresses weights to FP16 by default. * This method is only available in the Python API. @@ -82,14 +82,14 @@ OpenVINO uses the following functions for reading, converting, and saving models | Learn how to convert pre-trained models to OpenVINO IR. | :doc:`Model Optimization and Compression ` -| Find out how to optimize a model to achieve better inference performance, utilizing - multiple optimization methods for both in-training compression and post-training quantization. +| Find out how to optimize a model to achieve better inference performance, utilizing + multiple optimization methods for both in-training compression and post-training quantization. | :doc:`Running Inference ` -| See how to run inference with OpenVINO, which is the most basic form of deployment, +| See how to run inference with OpenVINO, which is the most basic form of deployment, and the quickest way of running a deep learning model. -| :doc:`Deployment Option 1. Using OpenVINO Runtime ` +| :doc:`Deployment Option 1. Using OpenVINO Runtime ` | Deploy a model locally, reading the file directly from your application and utilizing about-openvino/additional-resources available to the system. | Deployment on a local system uses the steps described in the section on running inference. diff --git a/docs/articles_en/openvino-workflow/deployment-locally.rst b/docs/articles_en/openvino-workflow/deployment-locally.rst index 6bdbece9dd119a..657c1f2ce63d4e 100644 --- a/docs/articles_en/openvino-workflow/deployment-locally.rst +++ b/docs/articles_en/openvino-workflow/deployment-locally.rst @@ -61,7 +61,7 @@ Granularity of Major Distribution Types The granularity of OpenVINO packages may vary for different distribution types. For example, the PyPI distribution of OpenVINO has a `single 'openvino' package `__ that contains all the runtime libraries and plugins, while a :doc:`local distribution ` is a more configurable type providing higher granularity. Below are important details of the set of libraries included in the OpenVINO Runtime package: -.. image:: ../_static/images/deployment_simplified.svg +.. image:: ../assets/images/deployment_simplified.svg - The main library ``openvino`` is used by users' C++ applications to link against with. For C language applications, ``openvino_c`` is additionally required for distribution. The library includes OpenVINO API 2.0. 
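As a quick illustration of the reading, converting, and saving functions listed in the workflow above, the following sketch converts a source model to ``ov.Model``, saves it as OpenVINO IR (weights are compressed to FP16 by default), and reads it back for compilation. The file names are placeholders, not part of the original documentation.

.. code-block:: python

   import openvino as ov

   # "model.onnx" is a placeholder for any supported source model file.
   ov_model = ov.convert_model("model.onnx")

   # save_model() writes OpenVINO IR and compresses weights to FP16 by default.
   ov.save_model(ov_model, "model.xml")

   # Later, the IR can be read and compiled directly for inference.
   compiled = ov.Core().compile_model("model.xml", "CPU")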
diff --git a/docs/articles_en/openvino-workflow/deployment-locally/local-distribution-libraries.rst b/docs/articles_en/openvino-workflow/deployment-locally/local-distribution-libraries.rst index ca471628f0ee23..9889f15c0ecbd9 100644 --- a/docs/articles_en/openvino-workflow/deployment-locally/local-distribution-libraries.rst +++ b/docs/articles_en/openvino-workflow/deployment-locally/local-distribution-libraries.rst @@ -35,7 +35,7 @@ Libraries for Pluggable Components The picture below presents dependencies between the OpenVINO Runtime core and pluggable libraries: -.. image:: ../../_static/images/deployment_full.svg +.. image:: ../../assets/images/deployment_full.svg Libraries for Compute Devices +++++++++++++++++++++++++++++ diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training.rst index c1796c87113ca1..ec08b12894e1aa 100644 --- a/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training.rst +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training.rst @@ -48,11 +48,6 @@ To install the latest released version via pip manager run the following command pip install nncf -.. note:: - - To install with specific frameworks, use the `pip install nncf[extras]` command, where extras is a list of possible extras, for example, `torch`, `tf`, `onnx`. - - To install the latest NNCF version from source, follow the instruction on `GitHub `__. .. note:: @@ -64,7 +59,7 @@ Working with NNCF The figure below shows a common workflow of applying training-time compressions with NNCF. The NNCF optimizations are added to the TensorFlow or PyTorch training script, and then the model undergoes fine-tuning. The optimized model can then be exported to OpenVINO IR format for accelerated performance with OpenVINO Runtime. -.. image:: ../../_static/images/nncf_workflow.svg +.. image:: ../../assets/images/nncf_workflow.svg Training-Time Compression Methods diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training-pytorch.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training-pytorch.rst new file mode 100644 index 00000000000000..91b405d43e92b3 --- /dev/null +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training-pytorch.rst @@ -0,0 +1,72 @@ +Quantization-aware Training (QAT) with PyTorch +=============================================== + +Below are the steps required to integrate QAT from NNCF into a training script written with +PyTorch: + + +1. Apply Post Training Quantization to the Model +################################################## + +Quantize the model using the :doc:`Post-Training Quantization <../quantizing-models-post-training/basic-quantization-flow>` method. + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [quantize] + + +2. Fine-tune the Model +######################## + +This step assumes applying fine-tuning to the model the same way it is done for the baseline model. For QAT, it is required to train the model for a few epochs with a small learning rate, for example, 10e-5. 
+Quantized models perform all computations in floating-point precision during fine-tuning by modeling quantization errors in both forward and backward passes. + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [tune_model] + + +.. note:: + The precision of weights transitions to INT8 only after converting the model to OpenVINO Intermediate Representation. + You can expect a reduction in model footprint only for that format. + + +These steps outline the basics of applying the QAT method from the NNCF. However, in some cases, it is required to save/load model +checkpoints during training. Since NNCF wraps the original model with its own object, it provides an API for these needs. + +3. (Optional) Save Checkpoint +#################################### + +To save a model checkpoint, use the following API: + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [save_checkpoint] + + +4. (Optional) Restore from Checkpoint +################################################ + +To restore the model from checkpoint, use the following API: + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [load_checkpoint] + + +Deploying the Quantized Model +############################### + +The model can be converted into the OpenVINO Intermediate Representation (IR) if needed, compiled, and run with OpenVINO without any additional steps. + +.. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_torch.py + :language: python + :fragment: [inference] + +For more details, see the corresponding :doc:`documentation <../../running-inference>`. + +Examples +#################### + +* `Quantization-aware Training of Resnet18 PyTorch Model `__ +* `Quantization-aware Training of STFPM PyTorch Model `__ diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training-tensorflow.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training-tensorflow.rst new file mode 100644 index 00000000000000..41a2ea615214a8 --- /dev/null +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training-tensorflow.rst @@ -0,0 +1,112 @@ +Quantization-aware Training (QAT) with TensorFlow +=================================================== + +Below are the steps required to integrate QAT from NNCF into a training script written with TensorFlow: + +.. note:: + Currently, NNCF for TensorFlow supports optimization of the models created using Keras + `Sequential API `__ or + `Functional API `__. + +1. Import NNCF API +######################## + +Add NNCF-related imports in the beginning of the training script: + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [imports] + +2. Create NNCF Configuration +#################################### + +Define NNCF configuration which consists of model-related parameters (the ``"input_info"`` section) and parameters +of optimization methods (the ``"compression"`` section). For faster convergence, it is also recommended to register a dataset object +specific to the DL framework. The data object will be used at the model creation step to initialize quantization parameters. + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [nncf_congig] + + +3. 
Apply Optimization Methods +#################################### + +Wrap the original model object with the ``create_compressed_model()`` API using the configuration +defined in the previous step. This method returns a so-called compression controller and a wrapped model that can be used the +same way as the original model. Optimization methods are applied at this step, so that the model +undergoes a set of corresponding transformations and contains additional operations required for optimization. In case of QAT, the compression controller object is used for model export and, optionally, in distributed training as demonstrated below. + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [wrap_model] + + +4. Fine-tune the Model +#################################### + +This step assumes applying fine-tuning to the model the same way it is done for the baseline model. For QAT, it is required to train the model for a few epochs with a small learning rate, for example, 10e-5. In principle, +you can skip this step, meaning that the post-training optimization will be applied to the model. + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [tune_model] + + +5. Multi-GPU Distributed Training +#################################### + +In the case of distributed multi-GPU training (not DataParallel), call ``compression_ctrl.distributed()`` before fine-tuning. This informs optimization methods to make adjustments to function in the distributed mode. + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [distributed] + + +.. note:: + The precision of weights transitions to INT8 only after converting the model to OpenVINO Intermediate Representation. + You can expect a reduction in model footprint only for that format. + + +These steps outline the basics of applying the QAT method from the NNCF. However, in some cases, it is required to save/load model +checkpoints during training. Since NNCF wraps the original model with its own object, it provides an API for these needs. + +6. (Optional) Save Checkpoint +#################################### + +To save a model checkpoint, use the following API: + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [save_checkpoint] + + +7. (Optional) Restore from Checkpoint +################################################ + +To restore the model from checkpoint, use the following API: + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [load_checkpoint] + + +For more details on saving/loading checkpoints in the NNCF, see the corresponding `NNCF documentation `__. + +Deploying quantized model +######################### + +The model can be converted into the OpenVINO Intermediate Representation (IR) if needed, compiled and run with OpenVINO. +No extra steps or options are required. + +.. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_tensorflow.py + :language: python + :fragment: [inference] + +For more details, see the corresponding :doc:`documentation <../../running-inference>`. 
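For the deployment step just described, a minimal sketch could look like the code below. It assumes a fine-tuned Keras model object named ``quantized_model`` and an NHWC input of shape ``1x224x224x3``; both names and shapes are assumptions for illustration and are not taken from the snippets referenced above.

.. code-block:: python

   import numpy as np
   import openvino as ov

   # `quantized_model` is assumed to be the fine-tuned tf.keras.Model from
   # the steps above; convert_model() accepts Keras models directly.
   ov_model = ov.convert_model(quantized_model)
   compiled = ov.Core().compile_model(ov_model, "CPU")

   # Dummy input used only for illustration; the shape is an assumption.
   result = compiled(np.zeros((1, 224, 224, 3), dtype=np.float32))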
+ +Examples +#################### + +* `Quantizing TensorFlow model with NNCF `__ + diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training.rst index cce63315939aaf..f5ec455a7e2e15 100644 --- a/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training.rst +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training.rst @@ -1,8 +1,12 @@ -.. {#qat_introduction} - Quantization-aware Training (QAT) ================================= +.. toctree:: + :maxdepth: 1 + :hidden: + + Quantization-aware Training with PyTorch + Quantization-aware Training with TensorFlow Introduction #################### @@ -12,223 +16,6 @@ degradation caused by quantization. In fact, this is the most accurate quantizat apply QAT from the Neural Network Compression Framework (NNCF) to get 8-bit quantized models. This assumes that you are knowledgeable in Python programming and familiar with the training code for the model in the source DL framework. -Using NNCF QAT -#################### - -Here, we provide the steps that are required to integrate QAT from NNCF into the training script written with -PyTorch or TensorFlow 2: - -.. note:: - Currently, NNCF for TensorFlow 2 supports optimization of the models created using Keras - `Sequential API `__ or - `Functional API `__. - -1. Import NNCF API -++++++++++++++++++++ - -In this step, you add NNCF-related imports in the beginning of the training script: - -.. tab-set:: - - .. tab-item:: PyTorch - :sync: pytorch - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py - :language: python - :fragment: [imports] - - .. tab-item:: TensorFlow 2 - :sync: tensorflow-2 - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py - :language: python - :fragment: [imports] - -2. Create NNCF configuration -++++++++++++++++++++++++++++ - -Here, you should define NNCF configuration which consists of model-related parameters (``"input_info"`` section) and parameters -of optimization methods (``"compression"`` section). For faster convergence, it is also recommended to register a dataset object -specific to the DL framework. It will be used at the model creation step to initialize quantization parameters. - -.. tab-set:: - - .. tab-item:: PyTorch - :sync: pytorch - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py - :language: python - :fragment: [nncf_congig] - - .. tab-item:: TensorFlow 2 - :sync: tensorflow-2 - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py - :language: python - :fragment: [nncf_congig] - - -3. Apply optimization methods -+++++++++++++++++++++++++++++ - -In the next step, you need to wrap the original model object with the ``create_compressed_model()`` API using the configuration -defined in the previous step. This method returns a so-called compression controller and a wrapped model that can be used the -same way as the original model. It is worth noting that optimization methods are applied at this step so that the model -undergoes a set of corresponding transformations and can contain additional operations required for the optimization. 
In -the case of QAT, the compression controller object is used for model export and, optionally, in distributed training as it -will be shown below. - -.. tab-set:: - - .. tab-item:: PyTorch - :sync: pytorch - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py - :language: python - :fragment: [wrap_model] - - .. tab-item:: TensorFlow 2 - :sync: tensorflow-2 - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py - :language: python - :fragment: [wrap_model] - - -4. Fine-tune the model -++++++++++++++++++++++ - -This step assumes that you will apply fine-tuning to the model the same way as it is done for the baseline model. In the -case of QAT, it is required to train the model for a few epochs with a small learning rate, for example, 10e-5. In principle, -you can skip this step which means that the post-training optimization will be applied to the model. - -.. tab-set:: - - .. tab-item:: PyTorch - :sync: pytorch - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py - :language: python - :fragment: [tune_model] - - .. tab-item:: TensorFlow 2 - :sync: tensorflow-2 - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py - :language: python - :fragment: [tune_model] - - - -5. Multi-GPU distributed training -+++++++++++++++++++++++++++++++++ - -In the case of distributed multi-GPU training (not DataParallel), you should call ``compression_ctrl.distributed()`` before -the fine-tuning that will inform optimization methods to do some adjustments to function in the distributed mode. - -.. tab-set:: - - .. tab-item:: PyTorch - :sync: pytorch - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py - :language: python - :fragment: [distributed] - - .. tab-item:: TensorFlow 2 - :sync: tensorflow-2 - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py - :language: python - :fragment: [distributed] - -6. Export quantized model -+++++++++++++++++++++++++ - -When fine-tuning finishes, the quantized model can be exported to the corresponding format for further inference: ONNX in -the case of PyTorch and frozen graph - for TensorFlow 2. - -.. tab-set:: - - .. tab-item:: PyTorch - :sync: pytorch - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py - :language: python - :fragment: [export] - - .. tab-item:: TensorFlow 2 - :sync: tensorflow-2 - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py - :language: python - :fragment: [export] - - -.. note:: - The precision of weights gets INT8 only after the step of model conversion to OpenVINO Intermediate Representation. - You can expect the model footprint reduction only for that format. - - -These were the basic steps to applying the QAT method from the NNCF. However, it is required in some cases to save/load model -checkpoints during the training. Since NNCF wraps the original model with its own object it provides an API for these needs. - -7. (Optional) Save checkpoint -+++++++++++++++++++++++++++++ - -To save model checkpoint use the following API: - -.. tab-set:: - - .. tab-item:: PyTorch - :sync: pytorch - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py - :language: python - :fragment: [save_checkpoint] - - .. tab-item:: TensorFlow 2 - :sync: tensorflow-2 - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py - :language: python - :fragment: [save_checkpoint] - - -8. 
(Optional) Restore from checkpoint -+++++++++++++++++++++++++++++++++++++ - -To restore the model from checkpoint you should use the following API: - -.. tab-set:: - - .. tab-item:: PyTorch - :sync: pytorch - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py - :language: python - :fragment: [load_checkpoint] - - .. tab-item:: TensorFlow 2 - :sync: tensorflow-2 - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py - :language: python - :fragment: [load_checkpoint] - - -For more details on saving/loading checkpoints in the NNCF, see the following `documentation `__. - -Deploying quantized model -######################### - -The quantized model can be deployed with OpenVINO in the same way as the baseline model. No extra steps or options are -required in this case. For more details, see the corresponding :doc:`documentation <../../running-inference>`. - -Examples -#################### - -* `Quantizing PyTorch model with NNCF `__ - -* `Quantizing TensorFlow model with NNCF `__ +:doc:`Quantization-aware Training with PyTorch ` +:doc:`Quantization-aware Training with TensorFlow ` \ No newline at end of file diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/quantizing-models-post-training.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/quantizing-models-post-training.rst index ae1a87c0260cd1..d34da0d615f3bc 100644 --- a/docs/articles_en/openvino-workflow/model-optimization-guide/quantizing-models-post-training.rst +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/quantizing-models-post-training.rst @@ -24,7 +24,7 @@ speed, mostly due to reduced throughput. The reduction is performed before the a when the model gets transformed into the quantized representation. The process does not require any training datasets or pipelines in the source DL framework. -.. image:: ../../_static/images/quantization_picture.svg +.. image:: ../../assets/images/quantization_picture.svg `Neural Network Compression Framework (NNCF) `__ provides a post-training quantization API, available in Python, that aims at reusing the code for diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/quantizing-models-post-training/basic-quantization-flow.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/quantizing-models-post-training/basic-quantization-flow.rst index de0b0f96cc0e1d..2b2136c6a255d2 100644 --- a/docs/articles_en/openvino-workflow/model-optimization-guide/quantizing-models-post-training/basic-quantization-flow.rst +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/quantizing-models-post-training/basic-quantization-flow.rst @@ -106,7 +106,7 @@ See the `example section <#examples-of-how-to-apply-nncf-post-training-quantizat After that the model can be converted into the OpenVINO Intermediate Representation (IR) if needed, compiled and run with OpenVINO. -If you have not already installed OpenVINO developer tools, install it with ``pip install openvino-dev``. +If you have not already installed OpenVINO developer tools, install it with ``pip install openvino``. .. 
tab-set:: diff --git a/docs/articles_en/openvino-workflow/model-optimization.rst b/docs/articles_en/openvino-workflow/model-optimization.rst index 1203b99f6486ea..2cf08990d6b1d6 100644 --- a/docs/articles_en/openvino-workflow/model-optimization.rst +++ b/docs/articles_en/openvino-workflow/model-optimization.rst @@ -17,21 +17,21 @@ Model optimization is an optional offline step of improving the final model perf - :doc:`Post-training Quantization ` is designed to optimize the inference of deep learning models by applying the post-training 8-bit integer quantization that does not require model retraining or fine-tuning. -- :doc:`Training-time Optimization `, a suite of advanced methods for training-time model optimization within the DL framework, such as PyTorch and TensorFlow 2.x. It supports methods like Quantization-aware Training, Structured and Unstructured Pruning, etc. +- :doc:`Training-time Optimization `, a suite of advanced methods for training-time model optimization within the DL framework, such as PyTorch and TensorFlow 2.x. It supports methods like Quantization-aware Training, Structured and Unstructured Pruning, etc. - :doc:`Weight Compression `, an easy-to-use method for Large Language Models footprint reduction and inference acceleration. .. note:: OpenVINO also supports optimized models (for example, quantized) from source frameworks such as PyTorch, TensorFlow, and ONNX (in Q/DQ; Quantize/DeQuantize format). No special steps are required in this case and optimized models can be converted to the OpenVINO Intermediate Representation format (IR) right away. -Post-training Quantization is the fastest way to optimize an arbitrary DL model and should be applied first, but it is limited in terms of achievable accuracy-performance trade-off. The recommended approach to obtain OpenVINO quantized model is to convert a model from original framework to ``ov.Model`` and ensure that the model works correctly in OpenVINO, for example, by calculating the model metrics. Then, ``ov.Model`` can be used as input for the ``nncf.quantize()`` method to get the quantized model (see the diagram below). +Post-training Quantization is the fastest way to optimize an arbitrary DL model and should be applied first, but it is limited in terms of achievable accuracy-performance trade-off. The recommended approach to obtain OpenVINO quantized model is to convert a model from original framework to ``ov.Model`` and ensure that the model works correctly in OpenVINO, for example, by calculating the model metrics. Then, ``ov.Model`` can be used as input for the ``nncf.quantize()`` method to get the quantized model or as input for the ``nncf.compress_weights()`` method to compress weights of Large Language Models (see the diagram below). In case of unsatisfactory accuracy or performance after Post-training Quantization, Training-time Optimization can be used as an option. -.. image:: ../_static/images/DEVELOPMENT_FLOW_V3_crunch.svg +.. image:: ../assets/images/DEVELOPMENT_FLOW_V3_crunch.svg Once the model is optimized using the aforementioned methods, it can be used for inference using the regular OpenVINO inference workflow. No changes to the inference code are required. -.. image:: ../_static/images/WHAT_TO_USE.svg +.. 
image:: ../assets/images/WHAT_TO_USE.svg Additional Resources #################### diff --git a/docs/articles_en/openvino-workflow/running-inference.rst b/docs/articles_en/openvino-workflow/running-inference.rst index 13e9d650914bb3..3ccd9f3ff7cc2e 100644 --- a/docs/articles_en/openvino-workflow/running-inference.rst +++ b/docs/articles_en/openvino-workflow/running-inference.rst @@ -49,6 +49,6 @@ OpenVINO Runtime uses a plugin architecture. Its plugins are software components The scheme below illustrates the typical workflow for deploying a trained deep learning model: -.. image:: ../_static/images/BASIC_FLOW_IE_C.svg +.. image:: ../assets/images/BASIC_FLOW_IE_C.svg diff --git a/docs/articles_en/openvino-workflow/running-inference/changing-input-shape.rst b/docs/articles_en/openvino-workflow/running-inference/changing-input-shape.rst index 6e019d85c35ffb..d8b76ef31545b4 100644 --- a/docs/articles_en/openvino-workflow/running-inference/changing-input-shape.rst +++ b/docs/articles_en/openvino-workflow/running-inference/changing-input-shape.rst @@ -47,7 +47,7 @@ to set a new batch size with the ``reshape`` method: The diagram below presents the results of using the method, where the size of model input is changed with an image input: -.. image:: ../../_static/images/original_vs_reshaped_model.svg +.. image:: ../../assets/images/original_vs_reshaped_model.svg When using the ``reshape`` method, you may take one of the approaches: diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection.rst index d20ec78f9407a2..d087e369ff117d 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection.rst @@ -73,7 +73,7 @@ input/output or :doc:`stateful operations <../stateful-models>` will be loaded to the CPU if it is in the candidate list. Otherwise, these models will follow the normal flow and be loaded to the device based on priority. -.. image:: ../../../_static/images/autoplugin_accelerate.svg +.. image:: ../../../assets/images/autoplugin_accelerate.svg This mechanism can be easily observed in the :ref:`Using AUTO with Benchmark app sample ` diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection/debugging-auto-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection/debugging-auto-device.rst index b0cde79f630363..46ff9165d24c11 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection/debugging-auto-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection/debugging-auto-device.rst @@ -103,27 +103,27 @@ With Intel® VTune™ Profiler installed you can configure your analysis with th 3. In the **where** pane, select **Local Host** - .. image:: ../../../../_static/images/OV_UG_supported_plugins_AUTO_debugging-img01-localhost.png + .. image:: ../../../../assets/images/OV_UG_supported_plugins_AUTO_debugging-img01-localhost.png :align: center 4. In the **what** pane, specify your target application/script on the local system. - .. image:: ../../../../_static/images/OV_UG_supported_plugins_AUTO_debugging-img02-launch.png + .. 
image:: ../../../../assets/images/OV_UG_supported_plugins_AUTO_debugging-img02-launch.png :align: center 5. In the **how** pane, choose and configure the analysis type you want to perform, for example, **Hotspots Analysis**: identify the most time-consuming functions and drill down to see time spent on each line of source code. Focus optimization efforts on hot code for the greatest performance impact. - .. image:: ../../../../_static/images/OV_UG_supported_plugins_AUTO_debugging-img03-hotspots.png + .. image:: ../../../../assets/images/OV_UG_supported_plugins_AUTO_debugging-img03-hotspots.png :align: center 6. Start the analysis by clicking the start button. When it is done, you will get a summary of the run, including top hotspots and top tasks in your application: - .. image:: ../../../../_static/images/OV_UG_supported_plugins_AUTO_debugging-img04-vtunesummary.png + .. image:: ../../../../assets/images/OV_UG_supported_plugins_AUTO_debugging-img04-vtunesummary.png :align: center 7. To analyze ITT info related to the Auto plugin, click on the **Bottom-up** tab, choose the **Task Domain/Task Type/Function/Call Stack** from the dropdown list - Auto plugin-related ITT info is under the MULTIPlugin task domain: - .. image:: ../../../../_static/images/OV_UG_supported_plugins_AUTO_debugging-img05-vtunebottomup.png + .. image:: ../../../../assets/images/OV_UG_supported_plugins_AUTO_debugging-img05-vtunebottomup.png :align: center diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst index 2d72b49fcf225a..6f817349800590 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst @@ -101,6 +101,11 @@ On platforms that natively support half-precision calculations (``bfloat16`` or of ``f32`` to achieve better performance (see the `Execution Mode Hint <#execution-mode-hint>`__). Thus, no special steps are required to run a model with ``bf16`` or ``f16`` inference precision. +.. important:: + + The ``bf16`` floating-point precision appears to have some limitations that impact the + inference accuracy in LLM models. For more details, refer to this :ref:`article `. + Using the half-precision provides the following performance benefits: - ``bfloat16`` and ``float16`` data types enable Intel® Advanced Matrix Extension (AMX) on 4+ generation Intel® Xeon® Scalable Processors, resulting in significantly faster computations on the corresponding hardware compared to AVX512 or AVX2 instructions in many deep learning operation implementations. diff --git a/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application.rst b/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application.rst index 3d62354ff51586..ce5e6fd20722a1 100644 --- a/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application.rst +++ b/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application.rst @@ -26,7 +26,7 @@ for Windows PowerShell, or ``setupvars.bat`` for Windows CMD). Otherwise, the `` variable won't be configured properly to pass ``find_package`` calls. -.. image:: ../../_static/images/IMPLEMENT_PIPELINE_with_API_C.svg +.. image:: ../../assets/images/IMPLEMENT_PIPELINE_with_API_C.svg Step 1. 
Create OpenVINO Runtime Core diff --git a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/general-optimizations.rst b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/general-optimizations.rst index 080c297ed2565a..d7520f57315ab0 100644 --- a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/general-optimizations.rst +++ b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/general-optimizations.rst @@ -43,7 +43,7 @@ The key advantage of the Async approach is that when a device is busy with the i In the example below, inference is applied to the results of the video decoding. It is possible to keep two parallel infer requests, and while the current one is processed, the input frame for the next one is being captured. This essentially hides the latency of capturing, so that the overall frame rate is rather determined only by the slowest part of the pipeline (decoding vs inference) and not by the sum of the stages. -.. image:: ../../../_static/images/synch-vs-asynch.svg +.. image:: ../../../assets/images/synch-vs-asynch.svg :alt: Intel® VTune™ screenshot Below are example-codes for the regular and async-based approaches to compare: diff --git a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimize-preprocessing.rst b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimize-preprocessing.rst index f3431bfd10b135..7d19e17a70f2c6 100644 --- a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimize-preprocessing.rst +++ b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimize-preprocessing.rst @@ -35,7 +35,7 @@ Consider the following standard example: deep learning model expects input with * For each pixel, subtract mean values and divide by scale factor. -.. image:: ../../../_static/images/preprocess_not_fit.png +.. image:: ../../../assets/images/preprocess_not_fit.png Even though it is relatively easy to implement all these steps in the application code manually, before actual inference, it is also possible with the use of Preprocessing API. Advantages of using the API are: diff --git a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-latency/model-caching-overview.rst b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-latency/model-caching-overview.rst index d24d817e760f46..38af00d3796d5d 100644 --- a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-latency/model-caching-overview.rst +++ b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-latency/model-caching-overview.rst @@ -67,7 +67,7 @@ If the device does not support the import/export capability, cache is not create Note that the first ``compile_model`` operation takes slightly longer, as the cache needs to be created - the compiled blob is saved into a cache file: -.. image:: ../../../../_static/images/caching_enabled.svg +.. image:: ../../../../assets/images/caching_enabled.svg Make it even faster: use compile_model(modelPath) @@ -113,7 +113,7 @@ With model caching enabled, the total load time is even shorter, if ``read_model :fragment: [ov:caching:part2] -.. image:: ../../../../_static/images/caching_times.svg +.. 
image:: ../../../../assets/images/caching_times.svg Advanced Examples ++++++++++++++++++++ diff --git a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-low-level-implementation.rst b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-low-level-implementation.rst index f948ca0c590d4b..1259b65fe04c49 100644 --- a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-low-level-implementation.rst +++ b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-low-level-implementation.rst @@ -5,8 +5,8 @@ Further Low-Level Implementation Details .. meta:: - :description: Automatic Batching moves asynchronicity from individual - requests to groups of requests, and the CPU streams are + :description: Automatic Batching moves asynchronicity from individual + requests to groups of requests, and the CPU streams are inference threads grouped by CPU cores. @@ -32,9 +32,9 @@ This provides much better performance for the networks than batching, especially | Requests are executed in parallel with a small number of threads. | Layer-wise, the streams imply much less synchronization. -.. |conventional-approach| image:: ../../../_static/images/cpu_execution_conventional_approach.svg +.. |conventional-approach| image:: ../../../assets/images/cpu_execution_conventional_approach.svg -.. |execution-streams| image:: ../../../_static/images/cpu_execution_streams.svg +.. |execution-streams| image:: ../../../assets/images/cpu_execution_streams.svg Compared to the batching, the parallelism is somewhat transposed (performed over inputs with much less synchronization within CNN ops): @@ -53,9 +53,9 @@ Compared to the batching, the parallelism is somewhat transposed (performed over - | |execution-streams-2| | Inputs-wise the streams are the “transposed” batch. -.. |large-batch-approach| image:: ../../../_static/images/large_batch_approach.svg +.. |large-batch-approach| image:: ../../../assets/images/large_batch_approach.svg -.. |execution-streams-2| image:: ../../../_static/images/cpu_execution_streams_2.svg +.. |execution-streams-2| image:: ../../../assets/images/cpu_execution_streams_2.svg Keep in mind that :doc:`high-level performance hints ` allow the implementation to select the optimal number of streams depending on model's compute demands and CPU capabilities, including :doc:`int8 inference <../../model-optimization>` hardware acceleration, number of cores, etc. @@ -63,15 +63,15 @@ Keep in mind that :doc:`high-level performance hints ` performs on-the-fly grouping of inference requests to improve device utilization. +:doc:`Automatic batching <../inference-devices-and-modes/automatic-batching>` performs on-the-fly grouping of inference requests to improve device utilization. It relaxes the requirement for an application to saturate devices such as GPU by using a large batch "explicitly". It performs transparent input gathering from individual inference requests followed by the actual batched execution, with no programming effort from the user: -.. image:: ../../../_static/images/batch_device.svg +.. image:: ../../../assets/images/batch_device.svg -Essentially, Automatic Batching shifts asynchronicity from individual requests to groups of requests that constitute the batches. Furthermore, for the execution to be efficient, it is very important that the requests arrive timely, without causing a batching timeout. 
+Essentially, Automatic Batching shifts asynchronicity from individual requests to groups of requests that constitute the batches. Furthermore, for the execution to be efficient, it is very important that the requests arrive timely, without causing a batching timeout. Normally, the timeout should never be hit. It is rather a graceful way to handle the application exit (when the inputs are not arriving anymore, so the full batch is not possible to collect). If a workload experiences timeouts, which lead to a drop in performance due to increased latency of every request, consider balancing its value against the batch size. For example, a smaller batch size and timeout value may yield better results than a large batch size coupled with a timeout value that cannot guarantee accommodating all the required requests. -Finally, following the ``get_tensor`` idiom section from the :doc:`general optimizations ` helps Automatic Batching to save on inputs/outputs copies. According to that, you should always prefer the "get" versions of the tensors' data access APIs in your applications. +Finally, following the ``get_tensor`` idiom section from the :doc:`general optimizations ` helps Automatic Batching to save on inputs/outputs copies. According to that, you should always prefer the "get" versions of the tensors' data access APIs in your applications. diff --git a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/precision-control.rst b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/precision-control.rst index afc333c89504ba..944b6de3032d60 100644 --- a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/precision-control.rst +++ b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/precision-control.rst @@ -18,18 +18,40 @@ of the weights, and it does not affect how the devices execute the model. This c a lot of confusion where, for example, you couldn't execute a high-performance model on the GPU by default, and the behavior between devices was different. -This guide will focus on how to control inference precision. And using lower precision is important for performance because compute bandwidth tends to be higher for smaller data types, and hardware often has special blocks for efficient multiply-accumulate operations with smaller data types only (e.g. Intel Xᵉ Matrix Extensions (XMX) on GPU and Intel Advanced Matrix Extensions (AMX) on CPU do not support ``f32``). Also, I/O operations requires less memory due to the smaller tensor byte size. This guide will focus on how to control inference precision. +This guide will focus on how to control inference precision. Using lower precision is +important for performance because compute bandwidth tends to be higher for smaller data +types, and hardware often has special blocks for efficient multiply-accumulate operations +with smaller data types only (e.g. Intel Xᵉ Matrix Extensions (XMX) on GPU and Intel +Advanced Matrix Extensions (AMX) on CPU do not support ``f32``). Also, I/O operations +require less memory due to the smaller tensor byte size.
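For illustration, here is a minimal, hedged sketch of how an application could check which floating-point precision a device would pick by default. It assumes the current ``openvino`` Python package API and that the target plugin (``CPU`` in this sketch) exposes the ``inference_precision`` property for reading:

.. code-block:: python

   import openvino as ov
   import openvino.properties.hint as hints

   core = ov.Core()
   # Ask the plugin which floating-point precision it would use by default.
   # On CPUs with native bf16 support this is typically bf16, otherwise f32.
   default_precision = core.get_property("CPU", hints.inference_precision)
   print(f"Default CPU inference precision: {default_precision}")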
Execution Mode ############## -``ov::hint::execution_mode`` is a high-level hint to control whether the user wants to keep the best accuracy (**ACCURACY mode**) or if the device can do some optimizations that may lower the accuracy for performance reasons (**PERFORMANCE mode**) - -* In **ACCURACY mode**, the device cannot convert floating point tensors to a smaller floating point type, so devices try to keep the accuracy metrics as close as possible to the original values obtained after training relative to the device's real capabilities. This means that most devices will infer with ``f32`` precision if your device supports it. -* In **PERFORMANCE mode**, the device can convert to smaller data types and apply other optimizations that may have some impact on accuracy rates, although we still try to minimize accuracy loss and may use mixed precision execution in some cases. - -If the model has been quantized using :doc:`OpenVINO optimization tools <../../model-optimization-guide/quantizing-models-post-training>` or any other method, the quantized operators will be executed with the target integer precision if the device has hardware acceleration for that type. For example, quantized ``int8`` primitives are executed with ``int8`` precision for both **ACCURACY** and **PERFORMANCE modes** if the device provides higher compute bandwidth for 8-bit data types compared to any available floating-point type. On the other hand, devices without hardware acceleration for the ``int8`` data type can keep such operators in floating point precision, and the exact floating point type will be affected by ``execution_mode`` and ``inference_precision`` properties. +``ov::hint::execution_mode`` is a high-level hint to control whether the user wants to keep +the best accuracy (**ACCURACY mode**) or if the device can do some optimizations that +may lower the accuracy for performance reasons (**PERFORMANCE mode**). + +* In **ACCURACY mode**, the device cannot convert floating point tensors to a smaller + floating point type, so devices try to keep the accuracy metrics as close as possible to + the original values obtained after training, relative to the device's real capabilities. + This means that most devices will infer with ``f32`` precision if your device supports it. +* In **PERFORMANCE mode**, the device can convert to smaller data types and apply other + optimizations that may have some impact on accuracy rates, although we still try to + minimize accuracy loss and may use mixed precision execution in some cases. + +If the model has been quantized using +:doc:`OpenVINO optimization tools <../../model-optimization-guide/quantizing-models-post-training>` +or any other method, the quantized operators will be executed with the target integer +precision if the device has hardware acceleration for that type. For example, quantized +``int8`` primitives are executed with ``int8`` precision for both **ACCURACY** and +**PERFORMANCE modes** if the device provides higher compute bandwidth for 8-bit data types +compared to any available floating-point type. On the other hand, devices without hardware +acceleration for the ``int8`` data type can keep such operators in floating point precision, +and the exact floating point type will be affected by the ``execution_mode`` and +``inference_precision`` properties.
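As a minimal sketch of how the hint can be passed when compiling a model, assuming the current ``openvino`` Python package API (``model.xml`` below is only a placeholder path); the snippets under "Code examples" below show the maintained versions:

.. code-block:: python

   import openvino as ov
   import openvino.properties.hint as hints

   core = ov.Core()
   model = core.read_model("model.xml")  # placeholder path to an IR model

   # Keep floating-point execution as close to the original model as possible.
   compiled_accuracy = core.compile_model(
       model, "CPU", {hints.execution_mode: hints.ExecutionMode.ACCURACY}
   )

   # Let the device trade some accuracy for speed (the default behavior).
   compiled_performance = core.compile_model(
       model, "CPU", {hints.execution_mode: hints.ExecutionMode.PERFORMANCE}
   )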
Code examples: @@ -53,11 +75,43 @@ Inference Precision ################### -``ov::hint::inference_precision`` precision is a lower-level property that allows you to specify the exact precision the user wants, but is less portable. For example, CPU supports ``f32`` inference precision and ``bf16`` on some platforms, GPU supports ``f32`` and ``f16``, so if a user wants to an application that uses multiple devices, they have to handle all these combinations manually or let OV do it automatically by using higher level ``execution_mode`` property. Another thing is that ``inference_precision`` is also a hint, so the value provided is not guaranteed to be used by Runtime (mainly in cases where the current device does not have the required hardware capabilities). +``ov::hint::inference_precision`` is a lower-level property that allows you +to specify the exact precision the user wants, but it is less portable. For example, the CPU +supports ``f32`` inference precision and ``bf16`` on some platforms, while the GPU supports ``f32`` +and ``f16``, so if a user wants to build an application that uses multiple devices, they have +to handle all these combinations manually or let OpenVINO do it automatically by using the +higher-level ``execution_mode`` property. Note that ``inference_precision`` is also +a hint, so the value provided is not guaranteed to be used by the Runtime (mainly in cases +where the current device does not have the required hardware capabilities). .. note:: - All devices only support floating-point data types (``f32``, ``f16``, ``bf16``) as a value for ``inference_precision`` attribute, because quantization cannot be done in Runtime. + All devices only support floating-point data types (``f32``, ``f16``, ``bf16``) as a value + for the ``inference_precision`` attribute, because quantization cannot be done in Runtime. + + +.. _limited_inference_precision: + +Limitation of the ``bf16`` inference precision +++++++++++++++++++++++++++++++++++++++++++++++ + +It is important to mention that inferring FP16 and FP32 LLM models with the ``bf16`` runtime +precision may result in an accuracy loss higher than the pre-determined threshold of 0.5%. +A higher accuracy drop may occur when inferring the **dolly-v2-12b**, **dolly-v2-3b**, and +**gpt-neox-20b** original PyTorch models with ``bf16``, and is caused by the limited +precision representation. + +To solve the issue, you might use an INT8 model and force the FP32 inference precision. +The accuracy of an INT8 model with FP32 is nearly the same as that of an FP16 model with ``f32``. +Additionally, selective FP32 execution of ops on the CPU plugin, together with the NNCF ``bf16`` +calibration, could potentially mitigate the accuracy loss. + +However, the solutions mentioned above would, unfortunately, also result in a significant +performance drop during large-batch inference on machines with Intel AMX-BF16 SPR. +In such cases, the fused multiply-add operation (FMA) is used instead of AMX. Also, +in a compute-bound case, such as LLM batch inference/serving, these workarounds +would drastically reduce the throughput by more than 60%. + Additional Resources diff --git a/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst b/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst index eec850fa1cd18b..49c70cb964cb87 100644 --- a/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst +++ b/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst @@ -16,7 +16,7 @@ output.
In contrast, for a "stateless" model to pass data between runs, all prod returned as output and needs to be handled by the application itself for reuse at the next execution. -.. image:: ../../_static/images/stateful_model_example.svg +.. image:: ../../assets/images/stateful_model_example.svg :alt: example comparison between stateless and stateful model implementations :align: center :scale: 90 % @@ -113,7 +113,7 @@ states. each run performed in a different infer request than the previous one would require the state to be set "manually", using the ``ov::VariableState::set_state`` method. -.. image:: ../../_static/images/stateful_model_init_subgraph.svg +.. image:: ../../assets/images/stateful_model_init_subgraph.svg :alt: diagram of how initial state value is set or reset :align: center :scale: 100 % diff --git a/docs/articles_en/openvino-workflow/running-inference/stateful-models/obtaining-stateful-openvino-model.rst b/docs/articles_en/openvino-workflow/running-inference/stateful-models/obtaining-stateful-openvino-model.rst index 2c005a9bd8a3f3..67e70c9b999f0c 100644 --- a/docs/articles_en/openvino-workflow/running-inference/stateful-models/obtaining-stateful-openvino-model.rst +++ b/docs/articles_en/openvino-workflow/running-inference/stateful-models/obtaining-stateful-openvino-model.rst @@ -28,7 +28,7 @@ MakeStateful Transformation The MakeStateful transformation changes the structure of the model by replacing the user-defined pairs of Parameter and Results with the Assign and ReadValue operations: -.. image:: ../../../_static/images/make_stateful_simple.svg +.. image:: ../../../assets/images/make_stateful_simple.svg :alt: diagram of MakeStateful Transformation :scale: 90 % :align: center @@ -44,7 +44,7 @@ Parameter/Result tensor names. If there are no tensor names, **Examples:** -.. image:: ../../../_static/images/make_stateful_detailed.png +.. image:: ../../../assets/images/make_stateful_detailed.png :alt: detailed diagram of MakeStateful Transformation :align: center @@ -91,7 +91,7 @@ and :doc:`Loop <../../../documentation/openvino-ir-format/operation-sets/operati and replacing pairs of Parameter and Results with the Assign and ReadValue operations, as illustrated by the following example: -.. image:: ../../../_static/images/applying_low_latency_2.svg +.. image:: ../../../assets/images/applying_low_latency_2.svg :alt: diagram of LowLatency Transformation :align: center @@ -162,7 +162,7 @@ Applying LowLatency2 Transformation :fragment: [ov:low_latency_2_use_parameters] - .. image:: ../../../_static/images/llt2_use_const_initializer.svg + .. image:: ../../../assets/images/llt2_use_const_initializer.svg :alt: diagram of constant subgraph initialization :align: center @@ -184,7 +184,7 @@ Applying LowLatency2 Transformation 4. Use state API. See sections :doc:`OpenVINO State API <../stateful-models>`, :ref:`Stateful Model Inference `. - .. image:: ../../../_static/images/low_latency_limitation_2.svg + .. image:: ../../../assets/images/low_latency_limitation_2.svg :alt: diagram showing low latency limitation :scale: 70 % :align: center diff --git a/docs/articles_en/openvino-workflow/torch-compile.rst b/docs/articles_en/openvino-workflow/torch-compile.rst index 02e2364c339a94..57682f2e143cd9 100644 --- a/docs/articles_en/openvino-workflow/torch-compile.rst +++ b/docs/articles_en/openvino-workflow/torch-compile.rst @@ -35,7 +35,7 @@ any additional PyTorch-based tracing/scripting. Execution diagram: -.. image:: ../_static/images/torch_compile_backend_openvino.svg +.. 
image:: ../assets/images/torch_compile_backend_openvino.svg :width: 992px :height: 720px :scale: 60% diff --git a/docs/home.rst b/docs/home.rst index 524a3ce5b48345..08b2e8d62cc340 100644 --- a/docs/home.rst +++ b/docs/home.rst @@ -57,7 +57,7 @@ Check out the `OpenVINO Cheat Sheet. a { font-size: 2rem!important; } -.bd-header .navbar-nav li a.nav-link:hover { - color: white; - text-decoration: none; -} - -.bd-links__title { +.svg-inline--fa .fa-outdent { display: none; + visibility: none; + color: white; } li.toctree-l1.has-children > a.reference.internal { @@ -196,6 +202,11 @@ nav.bd-links li>a { } } +.download-docs .sst-dropdown .sst-btn { + border-color: lightgray !important; +} + + /* Moving dropdown arrows to the left */ details.sd-dropdown .sd-summary-up, details.sd-dropdown .sd-summary-down { @@ -209,6 +220,11 @@ details.sd-dropdown:not([open]).sd-card { padding: 0px; } +/* Ttile is at the same place for both open and close states */ +.sd-card-header { + border-radius: 0px !important; + +} /* Ttile is at the same place for both open and close states */ details.sd-dropdown[open].sd-card { @@ -220,8 +236,6 @@ details.sd-dropdown .sd-summary-title { padding-left: 40px; } - - /* Second level items */ #bd-docs-nav > div > ul > li > ul { padding-left: 0.3rem; @@ -264,7 +278,6 @@ details.sd-dropdown .sd-summary-title { padding-right: 10px!important; } - /* Code reference text formatting override */ /* =================================================== */ code { @@ -283,9 +296,28 @@ code { background-color: #0054AE !important; } -.admonition { - border-radius:0px !important; +.admonition.tip, div.admonition.tip { + border-color: var(--pst-color-success) !important; + background-color: #effdf6 !important; } + +.admonition.important, div.admonition.important { + border-color: var(--pst-color-attention); + background-color: #fbf5f0 !important; +} + +.admonition.warning, div.admonition.warning { + background-color: #fbf5f0 !important; +} + +.admonition.note, div.admonition.note { + background-color: #f1fafe; +} + +details.sd-dropdown summary.sd-card-header+div.sd-summary-content { + background-color: rgb(242, 248, 251); +} + /* Table Sort Button */ /* =================================================== */ .sort-header { diff --git a/docs/sphinx_setup/_static/images/applying_low_latency.svg b/docs/sphinx_setup/_static/images/applying_low_latency.svg deleted file mode 100644 index 68ab0c24149491..00000000000000 --- a/docs/sphinx_setup/_static/images/applying_low_latency.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3cec32fe436ce551bbd91a60eac39588f5ab9b599d14c6496b89a4e8a9a37909 -size 266752 diff --git a/docs/sphinx_setup/_static/images/configuration_dialog.png b/docs/sphinx_setup/_static/images/configuration_dialog.png deleted file mode 100644 index 349fafc25e387f..00000000000000 --- a/docs/sphinx_setup/_static/images/configuration_dialog.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:55b68c91d4991dff5965d19e9b637848bbdcb49e75dbaae6af11d58fde7cf846 -size 20433 diff --git a/docs/sphinx_setup/_static/images/download_btn_github.svg b/docs/sphinx_setup/_static/images/download_btn_github.svg deleted file mode 100644 index da039dacffccb5..00000000000000 --- a/docs/sphinx_setup/_static/images/download_btn_github.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c00e5b644b75ac4fe82907567b684552f703540bdd2948413b9a24d0a6762492 -size 1350 diff --git 
a/docs/sphinx_setup/_static/images/download_btn_installer.svg b/docs/sphinx_setup/_static/images/download_btn_installer.svg deleted file mode 100644 index bb6fc4d16059ff..00000000000000 --- a/docs/sphinx_setup/_static/images/download_btn_installer.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:467a378cfe3fdfd298a195be51054f025d907696904905a899d5cfb1ba0532a2 -size 727 diff --git a/docs/sphinx_setup/_static/images/gapi_development_workflow.png b/docs/sphinx_setup/_static/images/gapi_development_workflow.png deleted file mode 100644 index 658fdafe87a60a..00000000000000 --- a/docs/sphinx_setup/_static/images/gapi_development_workflow.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a0a11bedbfe2df3352b064e80498aa39fbc3817eaf99439865a090f34501e44a -size 25936 diff --git a/docs/sphinx_setup/_static/images/gapi_face_analytics_pipeline.png b/docs/sphinx_setup/_static/images/gapi_face_analytics_pipeline.png deleted file mode 100644 index 31f045c5d77ca2..00000000000000 --- a/docs/sphinx_setup/_static/images/gapi_face_analytics_pipeline.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:662a823fbef3be0cca1755de9118e73b4137fe7ec4b7cb6a389e64b9ec5a9c13 -size 13511 diff --git a/docs/sphinx_setup/_static/images/gapi_face_beautification_algorithm.png b/docs/sphinx_setup/_static/images/gapi_face_beautification_algorithm.png deleted file mode 100644 index 7693c3b0fd825e..00000000000000 --- a/docs/sphinx_setup/_static/images/gapi_face_beautification_algorithm.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:12fe8e0b841aa6759f3b1975d3a877e65b8d72b752d11ffd212b67d11e62e048 -size 19539 diff --git a/docs/sphinx_setup/_static/images/gapi_face_beautification_example.jpg b/docs/sphinx_setup/_static/images/gapi_face_beautification_example.jpg deleted file mode 100644 index eb3df6b58785bf..00000000000000 --- a/docs/sphinx_setup/_static/images/gapi_face_beautification_example.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb32d3db8768ff157daeff999cc7f4361d2bca866ed6dc95b8f78d8cc62ae208 -size 176525 diff --git a/docs/sphinx_setup/_static/images/gapi_kernel_implementation_hierarchy.png b/docs/sphinx_setup/_static/images/gapi_kernel_implementation_hierarchy.png deleted file mode 100644 index f910caa840d191..00000000000000 --- a/docs/sphinx_setup/_static/images/gapi_kernel_implementation_hierarchy.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f291422f562825d4c5eee718b7c22e472b02a5a0a9c0be01d59b6b7cd8d756b1 -size 14603 diff --git a/docs/sphinx_setup/_static/images/gapi_programming_model.png b/docs/sphinx_setup/_static/images/gapi_programming_model.png deleted file mode 100644 index 2ac10dcc82c13f..00000000000000 --- a/docs/sphinx_setup/_static/images/gapi_programming_model.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:925f70ede92d71e16733d78e003f62cd8bfdee0790bddbf2b7ce4fc8ef3f44bf -size 171518 diff --git a/docs/sphinx_setup/_static/images/github.png b/docs/sphinx_setup/_static/images/github.png deleted file mode 100644 index 4bf56a3d3e4799..00000000000000 --- a/docs/sphinx_setup/_static/images/github.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7b2749d93dff16fc9062a0fa01fd694bf19385a0b4f3d0e409eb56f2648e3cfc -size 11929 diff --git a/docs/sphinx_setup/_static/images/head_banner.jpg 
b/docs/sphinx_setup/_static/images/head_banner.jpg deleted file mode 100644 index 45773d26369077..00000000000000 --- a/docs/sphinx_setup/_static/images/head_banner.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:07de2ee0d18f1d40afe1f2bb5420c7060eff09026a9138399d21b49b35cc0b8e -size 184552 diff --git a/docs/sphinx_setup/_static/images/img/import_pytorch.svg b/docs/sphinx_setup/_static/images/img/import_pytorch.svg deleted file mode 100644 index d1f8f5030e6566..00000000000000 --- a/docs/sphinx_setup/_static/images/img/import_pytorch.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7419b60d37a9bc058626c52fcbfec20c3a5d22c6d0875fb84ef0df7ec2a68671 -size 142191 diff --git a/docs/sphinx_setup/_static/images/img/import_tensorflow.svg b/docs/sphinx_setup/_static/images/img/import_tensorflow.svg deleted file mode 100644 index 40d0534168133a..00000000000000 --- a/docs/sphinx_setup/_static/images/img/import_tensorflow.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d5666c2ee7503bc2844a99f73c1b64afacd2c42dadef441ce115cc18b00922c7 -size 224644 diff --git a/docs/sphinx_setup/_static/images/img/openvino-logo-purple-black.png b/docs/sphinx_setup/_static/images/img/openvino-logo-purple-black.png deleted file mode 100644 index 6248a7820c50f7..00000000000000 --- a/docs/sphinx_setup/_static/images/img/openvino-logo-purple-black.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61e237b3ced7eaa0cf1f8c2688753867b172712925068a4a47e07b5c71e48bdf -size 89866 diff --git a/docs/sphinx_setup/_static/images/import_pytorch.svg b/docs/sphinx_setup/_static/images/import_pytorch.svg deleted file mode 100644 index d1f8f5030e6566..00000000000000 --- a/docs/sphinx_setup/_static/images/import_pytorch.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7419b60d37a9bc058626c52fcbfec20c3a5d22c6d0875fb84ef0df7ec2a68671 -size 142191 diff --git a/docs/sphinx_setup/_static/images/import_tensorflow.svg b/docs/sphinx_setup/_static/images/import_tensorflow.svg deleted file mode 100644 index 40d0534168133a..00000000000000 --- a/docs/sphinx_setup/_static/images/import_tensorflow.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d5666c2ee7503bc2844a99f73c1b64afacd2c42dadef441ce115cc18b00922c7 -size 224644 diff --git a/docs/sphinx_setup/_static/images/low_latency_limitation_1.svg b/docs/sphinx_setup/_static/images/low_latency_limitation_1.svg deleted file mode 100644 index 90f947b28c9754..00000000000000 --- a/docs/sphinx_setup/_static/images/low_latency_limitation_1.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dcc7af1fddf966fba75aece332e1fabb722ef780d6935ada2ddbcf3bb229223e -size 114289 diff --git a/docs/sphinx_setup/_static/images/model_conversion_diagram.svg b/docs/sphinx_setup/_static/images/model_conversion_diagram.svg deleted file mode 100644 index 8bb8d171bd1eee..00000000000000 --- a/docs/sphinx_setup/_static/images/model_conversion_diagram.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ab1d83dbd1546cb8eaada19501cf08d26e3ca1e2ce72fce63356e897fa26750e -size 253024 diff --git a/docs/sphinx_setup/_static/images/notebook_eye.png b/docs/sphinx_setup/_static/images/notebook_eye.png deleted file mode 100644 index ecc13e7bdfba89..00000000000000 --- a/docs/sphinx_setup/_static/images/notebook_eye.png +++ /dev/null @@ 
-1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1a2e58cf3e5703356b0e060ebc7cb0cbb852db9cde003d41c1d86bafc3a4ccb1 -size 68559 diff --git a/docs/sphinx_setup/_static/images/openvino-install.png b/docs/sphinx_setup/_static/images/openvino-install.png deleted file mode 100644 index adc5242bbb2fc4..00000000000000 --- a/docs/sphinx_setup/_static/images/openvino-install.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ef87640e224de61f41e76541e22a1392c84827dd0b7f70f3c616d86e75456aef -size 8508 diff --git a/docs/sphinx_setup/_static/images/openvino-uninstall-cli.png b/docs/sphinx_setup/_static/images/openvino-uninstall-cli.png deleted file mode 100644 index 654b79a5451a39..00000000000000 --- a/docs/sphinx_setup/_static/images/openvino-uninstall-cli.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cd260d96e1d8d425fba1eb2caf8b920e9c0511b421a81909babddca4ffa42dcb -size 37617 diff --git a/docs/sphinx_setup/_static/images/openvino-uninstall-dropdown-linux.png b/docs/sphinx_setup/_static/images/openvino-uninstall-dropdown-linux.png deleted file mode 100644 index 57d514d1c182b9..00000000000000 --- a/docs/sphinx_setup/_static/images/openvino-uninstall-dropdown-linux.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d0d7c0f692e14f3bb90d924d5ca25175e961963dac1d9a2dc6ca034f44d15863 -size 35667 diff --git a/docs/sphinx_setup/_static/images/openvino-uninstall-dropdown-macos.png b/docs/sphinx_setup/_static/images/openvino-uninstall-dropdown-macos.png deleted file mode 100644 index 672acf2468d7b6..00000000000000 --- a/docs/sphinx_setup/_static/images/openvino-uninstall-dropdown-macos.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a67b8d8a8aafcb14e4334df138f526ace9a243e297511a0e89b3f0fafcaf003e -size 33892 diff --git a/docs/sphinx_setup/_static/images/openvino-uninstall-dropdown-win.png b/docs/sphinx_setup/_static/images/openvino-uninstall-dropdown-win.png deleted file mode 100644 index 1d589ce2ad0ed0..00000000000000 --- a/docs/sphinx_setup/_static/images/openvino-uninstall-dropdown-win.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5847979227bd81d4f8d1d5be532acd81c056466e226204be205565d00b69fa34 -size 34976 diff --git a/docs/sphinx_setup/_static/images/openvino-uninstall-linux.png b/docs/sphinx_setup/_static/images/openvino-uninstall-linux.png deleted file mode 100644 index d22bee18602d7c..00000000000000 --- a/docs/sphinx_setup/_static/images/openvino-uninstall-linux.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5f276ad34326176aec19e93d3a277ede95096530e675991e71865b6edb6a5469 -size 42777 diff --git a/docs/sphinx_setup/_static/images/openvino-uninstall-macos.png b/docs/sphinx_setup/_static/images/openvino-uninstall-macos.png deleted file mode 100644 index ebfbe68495fb3f..00000000000000 --- a/docs/sphinx_setup/_static/images/openvino-uninstall-macos.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:398636f71383bb2feff5492fcff3dcd7c7b30b155b7a7c219755d8bc40ef788c -size 27305 diff --git a/docs/sphinx_setup/_static/images/openvino-uninstall-win.png b/docs/sphinx_setup/_static/images/openvino-uninstall-win.png deleted file mode 100644 index 96206bf1bdfec8..00000000000000 --- a/docs/sphinx_setup/_static/images/openvino-uninstall-win.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 
-oid sha256:5994af2e6d7c5403151e1dd3ed5741809590787b4490518b040bb30fe30d4cf3 -size 46941 diff --git a/docs/sphinx_setup/_static/images/range_supervision/img_combined_2.png b/docs/sphinx_setup/_static/images/range_supervision/img_combined_2.png deleted file mode 100644 index 039e9a324d1d7b..00000000000000 --- a/docs/sphinx_setup/_static/images/range_supervision/img_combined_2.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:271ba164a9726a5cf8d577f02db258c76df94e9ff79c3bebf95371ebdaa7d82d -size 1719169 diff --git a/docs/sphinx_setup/_static/images/range_supervision/scheme3.svg b/docs/sphinx_setup/_static/images/range_supervision/scheme3.svg deleted file mode 100644 index 4504c1d3b94758..00000000000000 --- a/docs/sphinx_setup/_static/images/range_supervision/scheme3.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2545abc4e5d26f6eb52c832cafa9ac1319958dfd7d550922e13cfcab44f1379c -size 68280 diff --git a/docs/sphinx_setup/_static/images/selection_dialog.png b/docs/sphinx_setup/_static/images/selection_dialog.png deleted file mode 100644 index 82ae960c8ad518..00000000000000 --- a/docs/sphinx_setup/_static/images/selection_dialog.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:194b9b8026456b9bb7d05834ffebc44192e028c0338369f0c77afc4772192a01 -size 18851 diff --git a/docs/sphinx_setup/_static/images/state_network_example.svg b/docs/sphinx_setup/_static/images/state_network_example.svg deleted file mode 100644 index 56d695015077bd..00000000000000 --- a/docs/sphinx_setup/_static/images/state_network_example.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8665a06ce99edcb4ccdade52b7fc5e2ae2a4810c5099cc35ffbb23d48fae56b5 -size 16970 diff --git a/docs/sphinx_setup/_static/images/supported_devices.png b/docs/sphinx_setup/_static/images/supported_devices.png deleted file mode 100644 index ff117bd8d61f34..00000000000000 --- a/docs/sphinx_setup/_static/images/supported_devices.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:566aab6ef86a50dad4fba5483a9b0abffc85778dccee7a0c7e98d4b09447f9b1 -size 130586 diff --git a/docs/sphinx_setup/_static/images/torch_compile_backend_openvino_ts.svg b/docs/sphinx_setup/_static/images/torch_compile_backend_openvino_ts.svg deleted file mode 100644 index 4be98857e767f5..00000000000000 --- a/docs/sphinx_setup/_static/images/torch_compile_backend_openvino_ts.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0e26fe889ada0e02a3bbc03e451a7e1d4b06037723349971efff1d721b5e13f6 -size 117253 diff --git a/docs/sphinx_setup/_static/js/modern.js b/docs/sphinx_setup/_static/js/modern.js new file mode 100644 index 00000000000000..dae212a07215db --- /dev/null +++ b/docs/sphinx_setup/_static/js/modern.js @@ -0,0 +1,12 @@ +$(document).ready(function() { + const elems = $( 'details.sd-dropdown' ); + for(let i = 0; i < elems.length; i++){ + elems[i].style.cssText = 'box-shadow: none !important; border: 1px !important;' + } + + + const admonitions = $( '.admonition' ); + for(let i = 0; i < admonitions.length; i++){ + admonitions[i].style.cssText = 'box-shadow: none !important; border-radius:0px !important; ' + } +}) \ No newline at end of file diff --git a/docs/sphinx_setup/conf.py b/docs/sphinx_setup/conf.py index c85a612be760b9..669e15760ae27b 100644 --- a/docs/sphinx_setup/conf.py +++ b/docs/sphinx_setup/conf.py @@ -199,6 +199,7 @@ 
'js/papaparse.min.js', 'js/viewer.min.js', 'js/custom.js', + 'js/modern.js', ] # monkeypatch sphinx api doc to prevent showing inheritance from object and enum.Enum diff --git a/docs/sphinx_setup/index.rst b/docs/sphinx_setup/index.rst index 3b4c1d48347602..fcccf196e94fdf 100644 --- a/docs/sphinx_setup/index.rst +++ b/docs/sphinx_setup/index.rst @@ -57,7 +57,7 @@ Check out the `OpenVINO Cheat Sheet. object->get_element_type(); - *tensor_type = (ov_element_type_e)type; + *tensor_type = find_ov_element_type_e(type); } CATCH_OV_EXCEPTIONS diff --git a/src/bindings/c/src/ov_tensor.cpp b/src/bindings/c/src/ov_tensor.cpp index 3ad7d408add000..952f9cb394ba92 100644 --- a/src/bindings/c/src/ov_tensor.cpp +++ b/src/bindings/c/src/ov_tensor.cpp @@ -19,16 +19,20 @@ const std::map element_type_map = { {ov_element_type_e::I32, ov::element::i32}, {ov_element_type_e::I64, ov::element::i64}, {ov_element_type_e::U1, ov::element::u1}, + {ov_element_type_e::U2, ov::element::u2}, + {ov_element_type_e::U3, ov::element::u3}, {ov_element_type_e::U4, ov::element::u4}, + {ov_element_type_e::U6, ov::element::u6}, {ov_element_type_e::U8, ov::element::u8}, {ov_element_type_e::U16, ov::element::u16}, {ov_element_type_e::U32, ov::element::u32}, {ov_element_type_e::U64, ov::element::u64}, {ov_element_type_e::NF4, ov::element::nf4}, {ov_element_type_e::F8E4M3, ov::element::f8e4m3}, - {ov_element_type_e::F8E5M3, ov::element::f8e5m2}}; + {ov_element_type_e::F8E5M3, ov::element::f8e5m2}, + {ov_element_type_e::STRING, ov::element::string}}; -inline ov_element_type_e find_ov_element_type_e(ov::element::Type type) { +ov_element_type_e find_ov_element_type_e(ov::element::Type type) { for (auto iter = element_type_map.begin(); iter != element_type_map.end(); iter++) { if (iter->second == type) { return iter->first; diff --git a/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py b/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py index 56f5cd79a61480..596c4415868910 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py @@ -56,6 +56,15 @@ def __init__(self, pt_module, fx_gm=None, nodes=None, mark_node_callback=None, i uargs = self.unpack_containers(self._nodes[i].args) self._outputs = [(arg[0], self._nodes.index(arg[1])) for arg in uargs if arg[1] is not None] + for idx, shape in enumerate(found_shapes): + if shape is not None: + new_shape=[] + for dim in range(0, len(shape)): + if (type(shape[dim]).__name__ == "SymInt"): + new_shape.append(-1) + else: + new_shape.append(shape[dim]) + found_shapes[idx] = torch.Size(new_shape) if not input_shapes or len(input_shapes) == 0: self.input_shapes = found_shapes diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py index fb7438aa78295e..4947589a77fd22 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py @@ -54,7 +54,11 @@ def openvino(subgraph, example_inputs, options=None): if (_get_aot_autograd(options)): global openvino_options openvino_options = options - return aot_autograd(fw_compiler=fx_openvino, bw_compiler=fx_openvino)(subgraph, example_inputs) + decompositions = _get_decompositions(options) + get_inf_decomposition_list() + decompositions = decompositions + get_aot_decomposition_list() + return aot_autograd(fw_compiler=fx_openvino, + 
bw_compiler=fx_openvino, + decompositions=get_decompositions(decompositions))(subgraph, example_inputs) return fx_openvino(subgraph, example_inputs, options) def fx_openvino(subgraph, example_inputs, options=None): @@ -82,15 +86,17 @@ def _call(*args): if inputs_reversed: example_inputs.reverse() - from torch._subclasses.fake_tensor import FakeTensorMode - decompositions = _get_decompositions(options) + get_inf_decomposition_list() if (_get_aot_autograd(options)): - decompositions = decompositions + get_aot_decomposition_list() - with FakeTensorMode(allow_non_fake_inputs=True): - model = make_fx(subgraph, decomposition_table=get_decompositions(decompositions))(*example_inputs) + model = subgraph + else: + from torch._subclasses.fake_tensor import FakeTensorMode + decompositions = _get_decompositions(options) + get_inf_decomposition_list() + with FakeTensorMode(allow_non_fake_inputs=True): + model = make_fx(subgraph, decomposition_table=get_decompositions(decompositions))(*example_inputs) + + with torch.no_grad(): + model.eval() - with torch.no_grad(): - model.eval() partitioner = Partitioner(options) compiled_model = partitioner.make_partitions(model, options) diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/compile.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/compile.py index 91192e4110d2bb..fa446893a05d07 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/compile.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/compile.py @@ -94,10 +94,14 @@ def openvino_compile(gm: GraphModule, *args, model_hash_str: str = None, options input_shapes = [] input_types = [] for idx, input_data in enumerate(args): - input_types.append(input_data.type()) - input_shapes.append(input_data.size()) + if isinstance(input_data, int): + input_types.append(torch.int64) + input_shapes.append(torch.Size([1])) + else: + input_types.append(input_data.type()) + input_shapes.append(input_data.size()) - decoder = TorchFXPythonDecoder(gm, input_shapes=input_shapes, input_types=input_types) + decoder = TorchFXPythonDecoder(gm) im = fe.load(decoder) @@ -118,8 +122,13 @@ def openvino_compile(gm: GraphModule, *args, model_hash_str: str = None, options } for idx, input_data in enumerate(args): - om.inputs[idx].get_node().set_element_type(dtype_mapping[input_data.dtype]) - om.inputs[idx].get_node().set_partial_shape(PartialShape(list(input_data.shape))) + if isinstance(input_data, int): + om.inputs[idx].get_node().set_element_type(dtype_mapping[torch.int64]) + om.inputs[idx].get_node().set_partial_shape(PartialShape(list(torch.Size([1])))) + else: + om.inputs[idx].get_node().set_element_type(dtype_mapping[input_data.dtype]) + om.inputs[idx].get_node().set_partial_shape(PartialShape(list(decoder.input_shapes[idx]))) + om.validate_nodes_and_infer_types() config = _get_config(options) @@ -129,4 +138,4 @@ def openvino_compile(gm: GraphModule, *args, model_hash_str: str = None, options config["CACHE_DIR"] = cache_root compiled = core.compile_model(om, device, config) - return compiled \ No newline at end of file + return compiled diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/execute.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/execute.py index 8d04efaa71ab8a..4f41f7b5a6a9de 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/execute.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/execute.py @@ -21,7 +21,7 @@ from 
openvino.frontend.pytorch.torchdynamo.partition import Partitioner from openvino.frontend.pytorch.torchdynamo.compile import openvino_compile from openvino.runtime import Core, Type, PartialShape -from openvino.frontend.pytorch.torchdynamo.backend_utils import _get_cache_dir, _get_device +from openvino.frontend.pytorch.torchdynamo.backend_utils import _get_cache_dir, _get_device, _get_aot_autograd from typing import Callable, Optional, Any @@ -40,6 +40,7 @@ ) compiled_cache = {} +req_cache = {} max_openvino_partitions = 0 partitioned_modules = {} @@ -91,14 +92,19 @@ def openvino_execute(gm: GraphModule, *args, executor_parameters=None, partition if use_cache and (partition_id in compiled_cache): compiled = compiled_cache[partition_id] + req = req_cache[partition_id] else: compiled = openvino_compile(gm, *args, model_hash_str=model_hash_str, options=options) compiled_cache[partition_id] = compiled + req = compiled.create_infer_request() + req_cache[partition_id] = req flat_args, _ = tree_flatten(args) - ov_inputs = [a.detach().cpu().numpy() for a in flat_args] + ov_inputs = [] + for arg in flat_args: + ov_inputs.append((arg if isinstance(arg, int) else arg.detach().cpu().numpy())) - res = compiled(ov_inputs) + res = req.infer(ov_inputs, share_inputs=True, share_outputs=True) results1 = [torch.from_numpy(res[out]) for out in compiled.outputs] if len(results1) == 1: @@ -123,7 +129,7 @@ def __call__(self, *args): try: result = openvino_execute(self.gm, *args, executor_parameters=self.executor_parameters, partition_id=self.partition_id, options=self.options) except Exception: - logger.warning("OpenVINO execution failed. Falling back to native PyTorch execution.") + logger.debug("OpenVINO execution failed. Falling back to native PyTorch execution.") self.perm_fallback = True return self.gm(*args) @@ -162,11 +168,12 @@ def openvino_execute_partitioned(gm: GraphModule, *args, executor_parameters=Non model_hash_str = executor_parameters.get("model_hash_str", None) signature = str(id(gm)) - for idx, input_data in enumerate(args): - if isinstance(input_data, torch.Tensor): - signature = signature + "_" + str(idx) + ":" + str(input_data.type())[6:] + ":" + str(input_data.size())[11:-1].replace(" ", "") - else: - signature = signature + "_" + str(idx) + ":" + type(input_data).__name__ + ":val(" + str(input_data) + ")" + if (not _get_aot_autograd(options)): + for idx, input_data in enumerate(args): + if isinstance(input_data, torch.Tensor): + signature = signature + "_" + str(idx) + ":" + str(input_data.type())[6:] + ":" + str(input_data.size())[11:-1].replace(" ", "") + else: + signature = signature + "_" + str(idx) + ":" + type(input_data).__name__ + ":val(" + str(input_data) + ")" if signature not in partitioned_modules: partitioned_modules[signature] = partition_graph(gm, use_python_fusion_cache=use_python_fusion_cache, diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/op_support.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/op_support.py index 8f2ba4906b46bc..c2d08bd14638df 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/op_support.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/op_support.py @@ -28,7 +28,12 @@ class OperatorSupport(OperatorSupport): def __init__(self, options): support_dict = { + "_operator.add": None, + "_operator.floordiv": None, "_operator.getitem": None, + "_operator.mul": None, + "_operator.sub": None, + "torch.ops.aten.sym_size.int": None, 
"torch.ops.aten._adaptive_avg_pool1d.default": None, "torch.ops.aten._adaptive_avg_pool2d.default": None, "torch.ops.aten._adaptive_avg_pool3d.default": None, diff --git a/src/bindings/python/src/pyopenvino/core/common.cpp b/src/bindings/python/src/pyopenvino/core/common.cpp index 5d8a4a73d6b312..9f57b794e2bff6 100644 --- a/src/bindings/python/src/pyopenvino/core/common.cpp +++ b/src/bindings/python/src/pyopenvino/core/common.cpp @@ -37,35 +37,37 @@ py::dtype get_dtype(const ov::element::Type& ov_type) { return ov_type_to_dtype().at(ov_type); } -const std::map& dtype_num_to_ov_type() { - static const std::map dtype_to_ov_type_mapping = { - {23, ov::element::f16}, // float16 - {11, ov::element::f32}, // float32 - {12, ov::element::f64}, // float64 - {1, ov::element::i8}, // int8 - {3, ov::element::i16}, // int16 -#ifdef _WIN32 - {7, ov::element::i32}, // int32 - {9, ov::element::i64}, // int64 -#else - {5, ov::element::i32}, // int32 - {7, ov::element::i64}, // int64 -#endif - {2, ov::element::u8}, // uint8 - {4, ov::element::u16}, // uint16 -#ifdef _WIN32 - {8, ov::element::u32}, // uint32 - {10, ov::element::u64}, // uint64 -#else - {6, ov::element::u32}, // uint32 - {8, ov::element::u64}, // uint64 -#endif - {0, ov::element::boolean}, // bool - {18, ov::element::string}, // bytes_ - {19, ov::element::string}, // str_ - {18, ov::element::string}, // bytes - {19, ov::element::string}, // str +std::map init_num_to_ov_type() { + static const std::map str_to_type_mapping = { + {"float16", ov::element::f16}, + {"float32", ov::element::f32}, + {"float64", ov::element::f64}, + {"int8", ov::element::i8}, + {"int16", ov::element::i16}, + {"int32", ov::element::i32}, + {"int64", ov::element::i64}, + {"uint8", ov::element::u8}, + {"uint16", ov::element::u16}, + {"uint32", ov::element::u32}, + {"uint64", ov::element::u64}, + {"bool", ov::element::boolean}, + {"bytes_", ov::element::string}, + {"str_", ov::element::string}, + {"bytes", ov::element::string}, + {"str", ov::element::string}, }; + + std::map int_to_type_mapping; + + for (const auto& e : str_to_type_mapping) { + int_to_type_mapping[py::dtype(e.first).num()] = e.second; + } + + return int_to_type_mapping; +} + +const std::map& dtype_num_to_ov_type() { + static const std::map dtype_to_ov_type_mapping = init_num_to_ov_type(); return dtype_to_ov_type_mapping; } diff --git a/src/bindings/python/src/pyopenvino/core/common.hpp b/src/bindings/python/src/pyopenvino/core/common.hpp index 374a31e7ece5f7..2311855a8ad34e 100644 --- a/src/bindings/python/src/pyopenvino/core/common.hpp +++ b/src/bindings/python/src/pyopenvino/core/common.hpp @@ -47,6 +47,8 @@ const std::map& ov_type_to_dtype(); py::dtype get_dtype(const ov::element::Type& ov_type); +std::map init_num_to_ov_type(); + const std::map& dtype_num_to_ov_type(); ov::element::Type get_ov_type(const py::array& array); diff --git a/src/common/transformations/tests/common_optimizations/rms_norm_decomposition_test.cpp b/src/common/transformations/tests/common_optimizations/rms_norm_decomposition_test.cpp index 2094bbf085bc26..579f5f56114dcf 100644 --- a/src/common/transformations/tests/common_optimizations/rms_norm_decomposition_test.cpp +++ b/src/common/transformations/tests/common_optimizations/rms_norm_decomposition_test.cpp @@ -50,6 +50,9 @@ TEST_F(TransformationTestsF, RMSNormFusionTest1) { model_ref = std::make_shared(ov::NodeVector{rms}, ov::ParameterVector{input}); } + comparator.enable(FunctionsComparator::CmpValues::ACCURACY); + 
comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); } TEST_F(TransformationTestsF, RMSNormFusionTest2) { diff --git a/src/core/include/openvino/op/roi_align_rotated.hpp b/src/core/include/openvino/op/roi_align_rotated.hpp index 500988e8745971..5e7bf95fa68bcc 100644 --- a/src/core/include/openvino/op/roi_align_rotated.hpp +++ b/src/core/include/openvino/op/roi_align_rotated.hpp @@ -8,13 +8,13 @@ namespace ov { namespace op { -namespace v14 { +namespace v15 { /// \brief ROIAlignRotated operation. /// /// \ingroup ov_ops_cpp_api class OPENVINO_API ROIAlignRotated : public util::ROIAlignBase { public: - OPENVINO_OP("ROIAlignRotated", "opset14", util::ROIAlignBase); + OPENVINO_OP("ROIAlignRotated", "opset15", util::ROIAlignBase); ROIAlignRotated() = default; /// \brief Constructs a ROIAlignRotated operation. @@ -57,6 +57,6 @@ class OPENVINO_API ROIAlignRotated : public util::ROIAlignBase { private: bool m_clockwise_mode; }; -} // namespace v14 +} // namespace v15 } // namespace op } // namespace ov diff --git a/src/core/include/openvino/opsets/opset14_tbl.hpp b/src/core/include/openvino/opsets/opset14_tbl.hpp index f96544d65edf81..1e1c520e475852 100644 --- a/src/core/include/openvino/opsets/opset14_tbl.hpp +++ b/src/core/include/openvino/opsets/opset14_tbl.hpp @@ -221,4 +221,3 @@ _OPENVINO_OP_REG(FakeConvert, ov::op::v13) // New operations added in opset14 _OPENVINO_OP_REG(ConvertPromoteTypes, ov::op::v14) _OPENVINO_OP_REG(Inverse, ov::op::v14) -_OPENVINO_OP_REG(ROIAlignRotated, ov::op::v14) diff --git a/src/core/include/openvino/opsets/opset15_tbl.hpp b/src/core/include/openvino/opsets/opset15_tbl.hpp index 33e433e0a30afa..50c8603cf2046c 100644 --- a/src/core/include/openvino/opsets/opset15_tbl.hpp +++ b/src/core/include/openvino/opsets/opset15_tbl.hpp @@ -13,6 +13,7 @@ _OPENVINO_OP_REG(Convert, ov::op::v0) _OPENVINO_OP_REG(ShapeOf, ov::op::v3) // New operations added in opset15 +_OPENVINO_OP_REG(ROIAlignRotated, ov::op::v15) _OPENVINO_OP_REG(ScatterNDUpdate, ov::op::v15) _OPENVINO_OP_REG(EmbeddingBagPacked, ov::op::v15) _OPENVINO_OP_REG(EmbeddingBagOffsets, ov::op::v15) diff --git a/src/core/reference/include/openvino/reference/rms_norm.hpp b/src/core/reference/include/openvino/reference/rms_norm.hpp index 15b327596bf643..b235813a264a44 100644 --- a/src/core/reference/include/openvino/reference/rms_norm.hpp +++ b/src/core/reference/include/openvino/reference/rms_norm.hpp @@ -8,6 +8,7 @@ #include #include "openvino/reference/add.hpp" +#include "openvino/reference/convert.hpp" #include "openvino/reference/divide.hpp" #include "openvino/reference/multiply.hpp" #include "openvino/reference/power.hpp" @@ -72,5 +73,33 @@ void rms_norm(const T* in, rms_norm(in, axes, out, in_shape, eps); multiply(out, scale, out, in_shape, scale_shape, op::AutoBroadcastType::NUMPY); } + +/** + * @brief Reference implementation of RMS operator with output type conversion + * + * Math Formula: Convert((x / Sqrt(ReduceMean(x^2, axes) + eps)) * scale), T_OUT) + * + * @param in Input pointer to data + * @param axes Axes for reduce mean calculation + * @param out Output pointer to results + * @param in_shape Shape of the input Tensor + * @param eps Epsilon for not dividing by zero while normalizing the value + * @param scale_shape Shape of the scale Tensor + * @param scale Input pointer to scale + * + */ +template +void rms_norm_mul_convert_out(const T_IN* in, + const AxisSet& axes, + T_OUT* out, + const Shape& in_shape, + double eps, + const 
Shape& scale_shape, + const T_IN* scale) { + std::vector tmp_out(shape_size(in_shape)); + rms_norm(in, axes, tmp_out.data(), in_shape, eps, scale_shape, scale); + convert(tmp_out.data(), out, tmp_out.size()); +} + } // namespace reference } // namespace ov diff --git a/src/core/shape_inference/include/roi_align_rotated_shape_inference.hpp b/src/core/shape_inference/include/roi_align_rotated_shape_inference.hpp index 5faeaefa19ad21..0cdcf95dde0b54 100644 --- a/src/core/shape_inference/include/roi_align_rotated_shape_inference.hpp +++ b/src/core/shape_inference/include/roi_align_rotated_shape_inference.hpp @@ -8,12 +8,12 @@ namespace ov { namespace op { -namespace v14 { +namespace v15 { class ROIAlignRotated; template > std::vector shape_infer(const ROIAlignRotated* op, const std::vector& input_shapes) { return roi_align::shape_infer(op, input_shapes); } -} // namespace v14 +} // namespace v15 } // namespace op } // namespace ov diff --git a/src/core/src/op/roi_align_rotated.cpp b/src/core/src/op/roi_align_rotated.cpp index 4eec291bc468d7..0bdec26fa15476 100644 --- a/src/core/src/op/roi_align_rotated.cpp +++ b/src/core/src/op/roi_align_rotated.cpp @@ -8,7 +8,7 @@ namespace ov { namespace op { -namespace v14 { +namespace v15 { ROIAlignRotated::ROIAlignRotated(const Output& input, const Output& rois, const Output& batch_indices, @@ -49,6 +49,6 @@ std::shared_ptr ROIAlignRotated::clone_with_new_inputs(const OutputVector& get_spatial_scale(), get_clockwise_mode()); } -} // namespace v14 +} // namespace v15 } // namespace op } // namespace ov diff --git a/src/core/tests/opset.cpp b/src/core/tests/opset.cpp index e98c8644236592..c63b4759287a2a 100644 --- a/src/core/tests/opset.cpp +++ b/src/core/tests/opset.cpp @@ -74,8 +74,8 @@ INSTANTIATE_TEST_SUITE_P(opset, OpsetTestParams{ov::get_opset11, 177}, OpsetTestParams{ov::get_opset12, 178}, OpsetTestParams{ov::get_opset13, 186}, - OpsetTestParams{ov::get_opset14, 189}, - OpsetTestParams{ov::get_opset15, 7}), + OpsetTestParams{ov::get_opset14, 188}, + OpsetTestParams{ov::get_opset15, 8}), OpsetTestNameGenerator{}); class MyOpOld : public ov::op::Op { diff --git a/src/core/tests/type_prop/roi_align.cpp b/src/core/tests/type_prop/roi_align.cpp index 964de695315f79..8bf7f734a07616 100644 --- a/src/core/tests/type_prop/roi_align.cpp +++ b/src/core/tests/type_prop/roi_align.cpp @@ -32,7 +32,7 @@ class ROIAlignTest : public testing::Test { ov::Dimension::value_type GetROISecondDimSizeForOp() const { // Those magic numbers comes from definition of ROIAlign ops. 
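        // ROIAlignRotated describes each ROI with 5 values (center x, center y, width,
        // height, rotation angle), while the other ROIAlign versions use 4 box
        // coordinates (x_1, y_1, x_2, y_2), hence the 5 vs 4 returned below.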
- if (std::is_same::value) + if (std::is_same::value) return 5; return 4; } @@ -281,5 +281,5 @@ REGISTER_TYPED_TEST_SUITE_P(ROIAlignTest, data_and_rois_not_same_type, batch_indicies_not_integer); -typedef Types ROIAlignTypes; +typedef Types ROIAlignTypes; INSTANTIATE_TYPED_TEST_SUITE_P(type_prop, ROIAlignTest, ROIAlignTypes); diff --git a/src/frontends/onnx/frontend/src/op/mmdeploy_roi_align_rotated.cpp b/src/frontends/onnx/frontend/src/op/mmdeploy_roi_align_rotated.cpp index 588cf07646bf7d..565c73674c75fd 100644 --- a/src/frontends/onnx/frontend/src/op/mmdeploy_roi_align_rotated.cpp +++ b/src/frontends/onnx/frontend/src/op/mmdeploy_roi_align_rotated.cpp @@ -56,7 +56,7 @@ ov::OutputVector mmdeploy_roi_align_rotated(const ov::frontend::onnx::Node& node const auto spatial_scale = node.get_attribute_value("spatial_scale", 1.0f); const auto clockwise = static_cast(node.get_attribute_value("clockwise", 0)); - return {std::make_shared(data, + return {std::make_shared(data, rois, rois_batch_idx, static_cast(pooled_h), diff --git a/src/frontends/onnx/frontend/src/op/reduce.cpp b/src/frontends/onnx/frontend/src/op/reduce.cpp index 9ba7e6c36fa9e8..ed4aaa5d13dc45 100644 --- a/src/frontends/onnx/frontend/src/op/reduce.cpp +++ b/src/frontends/onnx/frontend/src/op/reduce.cpp @@ -136,6 +136,14 @@ std::shared_ptr make_ov_reduction_op(const Node& node, return set_1::identity(node).at(0).get_node_shared_ptr(); } } + +std::shared_ptr onnx_reduce_sum_square(const ov::frontend::onnx::Node& node, + const std::set& supported_types, + const bool axes_as_attr = true) { + const auto input = ov::Output{node.get_ov_inputs().at(0)}; + const auto square_node = std::make_shared(input, input); + return make_ov_reduction_op(node, square_node, supported_types, axes_as_attr); +} } // namespace namespace set_1 { @@ -180,9 +188,7 @@ ov::OutputVector reduce_sum(const ov::frontend::onnx::Node& node) { } ov::OutputVector reduce_sum_square(const ov::frontend::onnx::Node& node) { - const auto input = ov::Output{node.get_ov_inputs().at(0)}; - const auto square_node = std::make_shared(input, input); - return {make_ov_reduction_op(node, square_node, supported_types_v1)}; + return {onnx_reduce_sum_square(node, supported_types_v1)}; } } // namespace set_1 @@ -199,9 +205,11 @@ namespace set_13 { ov::OutputVector reduce_sum(const ov::frontend::onnx::Node& node) { return {make_ov_reduction_op(node, node.get_ov_inputs().at(0), supported_types_v2, false)}; } + ov::OutputVector reduce_l2(const Node& node) { return {make_ov_reduction_op(node, node.get_ov_inputs().at(0), supported_types_v2)}; } + ov::OutputVector reduce_max(const ov::frontend::onnx::Node& node) { return {make_ov_reduction_op(node, node.get_ov_inputs().at(0), supported_types_v3)}; } @@ -209,6 +217,10 @@ ov::OutputVector reduce_max(const ov::frontend::onnx::Node& node) { ov::OutputVector reduce_min(const ov::frontend::onnx::Node& node) { return {make_ov_reduction_op(node, node.get_ov_inputs().at(0), supported_types_v3)}; } + +ov::OutputVector reduce_sum_square(const ov::frontend::onnx::Node& node) { + return {onnx_reduce_sum_square(node, supported_types_v2)}; +} } // namespace set_13 namespace set_18 { @@ -228,6 +240,10 @@ ov::OutputVector reduce_log_sum(const ov::frontend::onnx::Node& node) { make_ov_reduction_op(node, node.get_ov_inputs().at(0), supported_types_v2, false); return {std::make_shared(sum_node)}; } + +ov::OutputVector reduce_sum_square(const ov::frontend::onnx::Node& node) { + return {onnx_reduce_sum_square(node, supported_types_v2, false)}; +} } // namespace 
set_18 namespace set_20 { diff --git a/src/frontends/onnx/frontend/src/op/reduce.hpp b/src/frontends/onnx/frontend/src/op/reduce.hpp index f361c89fd2d960..740a6127fc3303 100644 --- a/src/frontends/onnx/frontend/src/op/reduce.hpp +++ b/src/frontends/onnx/frontend/src/op/reduce.hpp @@ -79,6 +79,12 @@ ov::OutputVector reduce_sum(const ov::frontend::onnx::Node& node); namespace set_1 { ov::OutputVector reduce_sum_square(const ov::frontend::onnx::Node& node); } // namespace set_1 +namespace set_13 { +ov::OutputVector reduce_sum_square(const ov::frontend::onnx::Node& node); +} // namespace set_13 +namespace set_18 { +ov::OutputVector reduce_sum_square(const ov::frontend::onnx::Node& node); +} // namespace set_18 } // namespace op } // namespace onnx diff --git a/src/frontends/onnx/frontend/src/ops_bridge.cpp b/src/frontends/onnx/frontend/src/ops_bridge.cpp index 02a79979ecaa73..5ab8a792c1fefa 100644 --- a/src/frontends/onnx/frontend/src/ops_bridge.cpp +++ b/src/frontends/onnx/frontend/src/ops_bridge.cpp @@ -503,6 +503,8 @@ OperatorsBridge::OperatorsBridge() { REGISTER_OPERATOR("ReduceSum", 1, reduce_sum); REGISTER_OPERATOR("ReduceSum", 13, reduce_sum); REGISTER_OPERATOR("ReduceSumSquare", 1, reduce_sum_square); + REGISTER_OPERATOR("ReduceSumSquare", 13, reduce_sum_square); + REGISTER_OPERATOR("ReduceSumSquare", 18, reduce_sum_square); REGISTER_OPERATOR("Relu", 1, relu); REGISTER_OPERATOR("Reshape", 1, reshape); REGISTER_OPERATOR("Resize", 1, resize); diff --git a/src/frontends/onnx/tests/models/reduce_sum_square_13.prototxt b/src/frontends/onnx/tests/models/reduce_sum_square_13.prototxt new file mode 100644 index 00000000000000..590fa0c7414504 --- /dev/null +++ b/src/frontends/onnx/tests/models/reduce_sum_square_13.prototxt @@ -0,0 +1,48 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + node { + input: "A" + output: "B" + op_type: "ReduceSumSquare" + } + name: "compute_graph" + input { + name: "A" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 1 + } + dim { + dim_value: 4 + } + dim { + dim_value: 4 + } + } + } + } + } + output { + name: "B" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + } + } + } + } +} +opset_import { + version: 13 +} diff --git a/src/frontends/onnx/tests/models/reduce_sum_square_18.prototxt b/src/frontends/onnx/tests/models/reduce_sum_square_18.prototxt new file mode 100644 index 00000000000000..4b9e0f944565eb --- /dev/null +++ b/src/frontends/onnx/tests/models/reduce_sum_square_18.prototxt @@ -0,0 +1,48 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + node { + input: "A" + output: "B" + op_type: "ReduceSumSquare" + } + name: "compute_graph" + input { + name: "A" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 1 + } + dim { + dim_value: 4 + } + dim { + dim_value: 4 + } + } + } + } + } + output { + name: "B" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + } + } + } + } +} +opset_import { + version: 18 +} diff --git a/src/frontends/onnx/tests/onnx_import.in.cpp b/src/frontends/onnx/tests/onnx_import.in.cpp index 3aa45042d6276b..59a53c0016eb1a 100644 --- a/src/frontends/onnx/tests/onnx_import.in.cpp +++ b/src/frontends/onnx/tests/onnx_import.in.cpp @@ -1257,6 +1257,38 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_model_reduce_sum_square) { test_case.run(); } +OPENVINO_TEST(${BACKEND_NAME}, onnx_model_reduce_sum_square_13) { + auto model = convert_model("reduce_sum_square_13.onnx"); + + 
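+    // ReduceSumSquare is ReduceSum(x * x); with no axes given it reduces over all
+    // dimensions, so the (1, 1, 4, 4) tensor of ones below sums to a single value of 16.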
// input data shape (1, 1, 4, 4) + Inputs inputs{ + ov::test::NDArray({{{{1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}}}}).get_vector()}; + + // output data shape (1,) + auto expected_output = ov::test::NDArray({{{{16}}}}).get_vector(); + + auto test_case = ov::test::TestCase(model, s_device); + test_case.add_multiple_inputs(inputs); + test_case.add_expected_output(expected_output); + test_case.run(); +} + +OPENVINO_TEST(${BACKEND_NAME}, onnx_model_reduce_sum_square_18) { + auto model = convert_model("reduce_sum_square_18.onnx"); + + // input data shape (1, 1, 4, 4) + Inputs inputs{ + ov::test::NDArray({{{{1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}}}}).get_vector()}; + + // output data shape (1,) + auto expected_output = ov::test::NDArray({{{{16}}}}).get_vector(); + + auto test_case = ov::test::TestCase(model, s_device); + test_case.add_multiple_inputs(inputs); + test_case.add_expected_output(expected_output); + test_case.run(); +} + OPENVINO_TEST(${BACKEND_NAME}, onnx_model_reduce_sum_13_axes_as_constant) { auto model = convert_model("reduce_sum_13_axes_as_constant.onnx"); diff --git a/src/frontends/onnx/tests/tests_python/test_backend.py b/src/frontends/onnx/tests/tests_python/test_backend.py index 75ae10fb9e55c4..91bd1701ca34b5 100644 --- a/src/frontends/onnx/tests/tests_python/test_backend.py +++ b/src/frontends/onnx/tests/tests_python/test_backend.py @@ -488,12 +488,6 @@ def expect_fail(test_case_path, xfail): # type: (str) -> None "OnnxBackendNodeModelTest.test_reduce_prod_keepdims_random_cpu", "OnnxBackendNodeModelTest.test_reduce_prod_negative_axes_keepdims_example_cpu", "OnnxBackendNodeModelTest.test_reduce_prod_negative_axes_keepdims_random_cpu", - "OnnxBackendNodeModelTest.test_reduce_sum_square_do_not_keepdims_example_cpu", - "OnnxBackendNodeModelTest.test_reduce_sum_square_do_not_keepdims_random_cpu", - "OnnxBackendNodeModelTest.test_reduce_sum_square_keepdims_example_cpu", - "OnnxBackendNodeModelTest.test_reduce_sum_square_keepdims_random_cpu", - "OnnxBackendNodeModelTest.test_reduce_sum_square_negative_axes_keepdims_example_cpu", - "OnnxBackendNodeModelTest.test_reduce_sum_square_negative_axes_keepdims_random_cpu", ), ( xfail_issue_99969, @@ -685,7 +679,6 @@ def expect_fail(test_case_path, xfail): # type: (str) -> None "OnnxBackendNodeModelTest.test_reduce_l1_empty_set_cpu", "OnnxBackendNodeModelTest.test_reduce_log_sum_exp_empty_set_cpu", "OnnxBackendNodeModelTest.test_reduce_prod_empty_set_cpu", - "OnnxBackendNodeModelTest.test_reduce_sum_square_empty_set_cpu", ), ( skip_misalignment, diff --git a/src/frontends/pytorch/src/op/expand.cpp b/src/frontends/pytorch/src/op/expand.cpp index 5005e34ebb7384..8e9ce327e647d5 100644 --- a/src/frontends/pytorch/src/op/expand.cpp +++ b/src/frontends/pytorch/src/op/expand.cpp @@ -42,24 +42,30 @@ OutputVector translate_expand_as(const NodeContext& context) { }; OutputVector translate_expand_fx(const NodeContext& context) { - // aten::expand(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a) - num_inputs_check(context, 2, 3); + auto num_inputs = context.get_input_size(); + num_inputs_check(context, 2, num_inputs); auto x = context.get_input(0); - // TODO: This is a temporary solution to optimize out Broadcast if the input and - // output shapes are same. This should be removed after a proper optimization is - // implemented. 
- auto sizes_const = context.const_input(1); - if (x.get_partial_shape().is_static() && x.get_shape() == sizes_const) { - return {x}; - } + std::vector shape_vec; auto sizes = context.get_input(1); - // TODO: figure out what implicit means - PYTORCH_OP_CONVERSION_CHECK(context.input_is_none(2) || context.const_input(2) == false, - "Unexpected value of implicit for expand operation"); + if (num_inputs != 2) { + for (size_t i = 1; i < num_inputs; i++) { + auto a = context.get_input_from_visible_context(i).get_node_shared_ptr(); + auto shape_input = context.get_input(static_cast(i)); + if (std::dynamic_pointer_cast(a) || + shape_input.get_partial_shape().rank().is_dynamic() || + shape_input.get_partial_shape().rank().get_length() == 0) { + shape_vec.push_back(-1); + } else { + auto val = context.const_input(i); + shape_vec.push_back(val); + } + } + sizes = ov::op::v0::Constant::create(element::i32, Shape{num_inputs - 1}, shape_vec); + } return base_expand(context, x, sizes); }; } // namespace op } // namespace pytorch } // namespace frontend -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/frontends/pytorch/src/op/reshape.cpp b/src/frontends/pytorch/src/op/reshape.cpp index 97306ba1f6d4e7..edea4c7aefb44a 100644 --- a/src/frontends/pytorch/src/op/reshape.cpp +++ b/src/frontends/pytorch/src/op/reshape.cpp @@ -5,6 +5,9 @@ #include "openvino/op/reshape.hpp" #include "openvino/frontend/pytorch/node_context.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/squeeze.hpp" +#include "openvino/op/unsqueeze.hpp" #include "utils.hpp" namespace ov { @@ -22,6 +25,77 @@ OutputVector translate_reshape(const NodeContext& context) { return {context.mark_node(reshape)}; }; +OutputVector translate_reshape_fx(const NodeContext& context) { + // Schema: aten.view.default(Tensor input, int[] shape) -> Tensor + auto num_inputs = context.get_input_size(); + num_inputs_check(context, 2, num_inputs); + std::vector shape_vec; + if (context.get_input_type(1).is()) { + int num_dyn_dims = 0; + for (size_t i = 1; i < num_inputs; i++) { + auto shape_input = context.get_input(static_cast(i)); + if (context.get_input_type(i).as().element_type.is()) { + auto const_val = context.const_input(i); + shape_vec.push_back(const_val); + } else { + // Set dimension to be dynamic if it's coming from an argument or another node + shape_vec.push_back(-1); + num_dyn_dims++; + } + } + // We cannot use multiple -1s if there are more than 1 dynamic dimensions + if (num_dyn_dims >= 2) { + auto inp_shape = context.get_input(0).get_partial_shape(); + // If there are multiple dynamic dymensions, we cannot support inputs with dynamic rank + if (inp_shape.rank().is_static()) { + auto zero = context.mark_node(ov::op::v0::Constant::create(element::i32, Shape{1}, {0})); + if (inp_shape.size() >= 3 && inp_shape.size() + 1 == shape_vec.size() && shape_vec[0] == 1 && + inp_shape[0] == shape_vec[1]) { + // [N, ...] -> [1, N, ...] Can be translated to Unsqueeze + auto unsqueeze = + context.mark_node(std::make_shared(context.get_input(0), zero)); + return {unsqueeze}; + } else if (shape_vec.size() >= 3 && shape_vec.size() + 1 == inp_shape.size() && inp_shape[0] == 1 && + inp_shape[1] == shape_vec[0]) { + // [1, N, ...] -> [N, ...] 
Can be translated to Squeeze + auto squeeze = context.mark_node(std::make_shared(context.get_input(0), zero)); + return {squeeze}; + } else if (inp_shape.size() == shape_vec.size()) { + // If the input rank is equal to output rank, we can use 0s in place of dynamic dimensions + for (size_t k = 0; k < shape_vec.size(); k++) { + if (shape_vec[k] == -1) + shape_vec[k] = 0; + } + } else { + FRONT_END_GENERAL_CHECK( + false, + "Cannot support reshape with multiple dynamic dimensions for unequal ranks"); + } + } else { + FRONT_END_GENERAL_CHECK( + false, + "Cannot support reshape with multiple dynamic dimensions for dynamic input ranks"); + } + } + + auto shape_const = ov::op::v0::Constant::create(element::i32, Shape{num_inputs - 1}, shape_vec); + auto reshape = std::make_shared(context.get_input(0), shape_const, true); + return {context.mark_node(reshape)}; + } else { + auto shape_input = context.get_input(1); + if (shape_input.get_partial_shape().rank().is_dynamic() || + shape_input.get_partial_shape().rank().get_length() == 0) { + shape_vec.push_back(0); + auto shape_const = ov::op::v0::Constant::create(element::i32, Shape{1}, shape_vec); + auto result = + context.mark_node(std::make_shared(context.get_input(0), shape_const, true)); + return {result}; + } + auto reshape = std::make_shared(context.get_input(0), context.get_input(1), true); + return {context.mark_node(reshape)}; + } +}; + } // namespace op } // namespace pytorch } // namespace frontend diff --git a/src/frontends/pytorch/src/op/slice.cpp b/src/frontends/pytorch/src/op/slice.cpp index e718183ae13d7e..6994979a1e77e9 100644 --- a/src/frontends/pytorch/src/op/slice.cpp +++ b/src/frontends/pytorch/src/op/slice.cpp @@ -18,7 +18,9 @@ namespace op { using namespace ov::op; -OutputVector translate_slice_common(const NodeContext& context, const size_t num_inputs) { +OutputVector translate_slice_common(const NodeContext& context, + const size_t num_inputs, + const bool stop_dynamic_rank_unsqueeze = true) { // aten::slice.t(t[] l, int? start=None, int? end=None, int step=1) -> (t[]) // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> (Tensor(a)) ov::Output dim; @@ -56,7 +58,9 @@ OutputVector translate_slice_common(const NodeContext& context, const size_t num ov::Output end; if (!context.input_is_none(end_idx)) { end = context.get_input(end_idx); - if (end.get_partial_shape().rank().is_dynamic() || end.get_partial_shape().rank().get_length() == 0) { + // TODO: Find a better way to solve the issue with dynamic ranks for "end" + if ((stop_dynamic_rank_unsqueeze && end.get_partial_shape().rank().is_dynamic()) || + (!(end.get_partial_shape().rank().is_dynamic()) && end.get_partial_shape().rank().get_length() == 0)) { end = context.mark_node(std::make_shared(end, axis_0)); } } else { @@ -81,10 +85,10 @@ OutputVector translate_slice(const NodeContext& context) { OutputVector translate_slice_fx(const NodeContext& context) { // slice.Tensor(Tensor(a) self, int dim=0, SymInt? start=None, SymInt? 
end=None, SymInt step=1) -> Tensor(a) // FX version of slice have the inputs in the same order as it has 5 inputs, even if it has less than 5 inputs - return translate_slice_common(context, 5); + return translate_slice_common(context, 5, false); }; } // namespace op } // namespace pytorch } // namespace frontend -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index af09480ea4282d..ef53c75d0fe369 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp @@ -284,6 +284,7 @@ OP_CONVERTER(translate_new_zeros_fx); OP_CONVERTER(translate_ones_fx); OP_CONVERTER(translate_ones_like_fx); OP_CONVERTER(translate_reflection_pad_nd_fx); +OP_CONVERTER(translate_reshape_fx); OP_CONVERTER(translate_rsub_fx); OP_CONVERTER(translate_scalar_tensor_fx); OP_CONVERTER(translate_scaled_dot_product_attention_fx); @@ -733,7 +734,11 @@ const std::map get_supported_ops_ts() { const std::map get_supported_ops_fx() { return { + {"", op::translate_add}, + {"", op::translate_floor_divide}, {"", op::translate_getitem}, // TODO: Check if there is any other way to handle this + {"", op::translate_mul}, + {"", op::translate_sub}, {"aten._adaptive_avg_pool1d.default", op::translate_adaptive_avg_pool1d}, {"aten._adaptive_avg_pool2d.default", op::translate_adaptive_avg_pool2d}, {"aten._adaptive_avg_pool3d.default", op::translate_adaptive_avg_pool3d}, @@ -949,6 +954,7 @@ const std::map get_supported_ops_fx() { {"aten.sub.Tensor", op::translate_sub_fx}, {"aten.sum.default", op::translate_sum_fx}, {"aten.sum.dim_IntList", op::translate_sum_fx}, + {"aten.sym_size.int", op::translate_size}, {"aten.t.default", op::translate_t}, {"aten.tan.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, {"aten.tanh.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, @@ -961,7 +967,7 @@ const std::map get_supported_ops_fx() { {"aten.upsample_nearest2d.default", op::translate_upsample_nearest2d}, {"aten.var.correction", op::translate_var_fx}, {"aten.var_mean.correction", op::translate_var_mean_fx}, - {"aten.view.default", op::translate_reshape}, + {"aten.view.default", op::translate_reshape_fx}, {"aten.where.self", op::translate_where}, {"aten.zeros.default", op::translate_zeros_fx}, {"aten.zeros.names", op::translate_zeros_fx}, diff --git a/src/inference/include/openvino/runtime/intel_gpu/properties.hpp b/src/inference/include/openvino/runtime/intel_gpu/properties.hpp index 7f661d5b67a74a..185195e288805c 100644 --- a/src/inference/include/openvino/runtime/intel_gpu/properties.hpp +++ b/src/inference/include/openvino/runtime/intel_gpu/properties.hpp @@ -115,6 +115,14 @@ static constexpr Property host_task_priority{"GPU_HOST_TASK_ * @ingroup ov_runtime_ocl_gpu_prop_cpp_api */ static constexpr Property available_device_mem{"AVAILABLE_DEVICE_MEM_SIZE"}; + +/** + * @brief Turning on this key disables SDPA operation decomposition and keeps SDPA operation in the graph. + * Enabling SDPA optimization may provide performance improvements and memory usage reduction. + * This key serves as a recommendation and may be ignored in known sub-optimal cases. 
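+ *
+ * A minimal usage sketch (assuming an existing ov::Core instance and that the property
+ * is passed as part of the compilation config):
+ * @code
+ * auto compiled = core.compile_model(model, "GPU",
+ *                                    ov::intel_gpu::hint::enable_sdpa_optimization(true));
+ * @endcode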
+ * @ingroup ov_runtime_ocl_gpu_prop_cpp_api + */ +static constexpr Property enable_sdpa_optimization{"GPU_ENABLE_SDPA_OPTIMIZATION"}; } // namespace hint /** diff --git a/src/plugins/auto/src/auto_schedule.cpp b/src/plugins/auto/src/auto_schedule.cpp index 0c248a2b491054..3de0cc9f00b871 100644 --- a/src/plugins/auto/src/auto_schedule.cpp +++ b/src/plugins/auto/src/auto_schedule.cpp @@ -133,7 +133,6 @@ void AutoSchedule::init() { if (m_compile_context[ACTUALDEVICE].m_is_enabled) { LOG_INFO_TAG("select device:%s", m_compile_context[ACTUALDEVICE].m_device_info.device_name.c_str()); bool is_actual_cpu = m_compile_context[ACTUALDEVICE].m_device_info.device_name.find("CPU") != std::string::npos; - bool is_actual_gpu = m_compile_context[ACTUALDEVICE].m_device_info.device_name.find("GPU") != std::string::npos; // if Actual device is CPU or perf_hint is cumulative, disabled m_compile_context[CPU], only use // m_compile_context[ACTUALDEVICE] if (is_actual_cpu || !m_context->m_startup_fallback) { @@ -148,29 +147,11 @@ void AutoSchedule::init() { // limit the threads num for compiling auto device = m_compile_context[ACTUALDEVICE].m_device_info.device_name; auto& device_config = m_compile_context[ACTUALDEVICE].m_device_info.config; - if (is_actual_gpu) { - int max_threads = 0; - try { - max_threads = m_context->m_ov_core->get_property(device, ov::compilation_num_threads); - } catch (const ov::Exception&) { - LOG_DEBUG_TAG("cannot get MAX_NUM_THREADS from GPU"); - } - if (max_threads == static_cast(std::thread::hardware_concurrency())) { - int thread_num = max_threads / 2; - m_compile_context[ACTUALDEVICE].m_device_info.config.insert( - ov::compilation_num_threads(thread_num)); - LOG_DEBUG_TAG("gpu streams number for compiling: %d", thread_num); - } else { - // user set the compiling threads num - // use the user's val anyway - LOG_DEBUG_TAG("user defined compiling threads: %d", max_threads); - } - } std::string cache_dir = device_config.count(ov::cache_dir.name()) ? device_config[ov::cache_dir.name()].as() : m_context->m_ov_core->get_property("", ov::cache_dir); - if (!m_context->m_is_set_startup_fallback && !cache_dir.empty()) { + if (m_context->m_startup_fallback && !cache_dir.empty()) { const auto properties = m_context->m_ov_core->create_compile_config(ov::DeviceIDParser(device).get_device_name(), device_config); @@ -323,15 +304,20 @@ void AutoSchedule::try_to_compile_model(AutoCompileContext& context, const std:: device_config.find(ov::compilation_num_threads.name()) != device_config.end()); if (cur_dev_is_gpu && m_compile_context[CPU].m_is_enabled && !is_already_set_gpu) { device_config.insert(ov::intel_gpu::hint::host_task_priority(ov::hint::Priority::HIGH)); - auto proc_type_table = get_org_proc_type_table(); - int compilation_num_threads = proc_type_table[0][MAIN_CORE_PROC] != 0 - ? proc_type_table[0][MAIN_CORE_PROC] - : proc_type_table[0][EFFICIENT_CORE_PROC]; - if (device_config.insert(ov::compilation_num_threads(compilation_num_threads)).second) - LOG_DEBUG_TAG("gpu streams number for compiling: %d", compilation_num_threads); - else - LOG_DEBUG_TAG("user defined compiling threads: %d", - device_config[ov::compilation_num_threads.name()].as()); + int max_threads = 0; + try { + m_context->m_ov_core->get_property(device, ov::compilation_num_threads); + auto proc_type_table = get_org_proc_type_table(); + max_threads = proc_type_table[0][MAIN_CORE_PROC] != 0 ? 
proc_type_table[0][MAIN_CORE_PROC] + : proc_type_table[0][EFFICIENT_CORE_PROC]; + if (device_config.insert(ov::compilation_num_threads(max_threads)).second) + LOG_DEBUG_TAG("gpu streams number for compiling: %d", max_threads); + else + LOG_DEBUG_TAG("user defined compiling threads: %d", + device_config[ov::compilation_num_threads.name()].as()); + } catch (const ov::Exception&) { + LOG_DEBUG_TAG("cannot get MAX_NUM_THREADS from GPU"); + } } } try { diff --git a/src/plugins/auto/src/common.hpp b/src/plugins/auto/src/common.hpp index 63fb8753e4fff2..28567eb23392c4 100644 --- a/src/plugins/auto/src/common.hpp +++ b/src/plugins/auto/src/common.hpp @@ -207,7 +207,6 @@ class ScheduleContext : public std::enable_shared_from_this { bool m_need_perf_counters; bool m_batching_disabled = false; bool m_startup_fallback = true; - bool m_is_set_startup_fallback = false; bool m_runtime_fallback = true; bool m_bind_buffer = false; std::shared_ptr m_model; diff --git a/src/plugins/auto/src/plugin.cpp b/src/plugins/auto/src/plugin.cpp index 9d8174252d21c9..06b3b7dbc947e4 100644 --- a/src/plugins/auto/src/plugin.cpp +++ b/src/plugins/auto/src/plugin.cpp @@ -436,7 +436,6 @@ std::shared_ptr Plugin::compile_model_impl(const std::string OPENVINO_ASSERT(auto_s_context->m_ov_core); auto_s_context->m_log_tag = get_device_name(); auto_s_context->m_model_precision = model_precision; - auto_s_context->m_is_set_startup_fallback = load_config.is_set_by_user(ov::intel_auto::enable_startup_fallback); auto_s_context->m_startup_fallback = load_config.get_property(ov::intel_auto::enable_startup_fallback); auto_s_context->m_runtime_fallback = load_config.get_property(ov::intel_auto::enable_runtime_fallback); auto_s_context->m_bind_buffer = load_config.get_property(ov::intel_auto::device_bind_buffer); diff --git a/src/plugins/auto/tests/functional/behavior/caching_test.cpp b/src/plugins/auto/tests/functional/behavior/caching_test.cpp index 1b606470fa2f53..1b2df23f9c0d1c 100644 --- a/src/plugins/auto/tests/functional/behavior/caching_test.cpp +++ b/src/plugins/auto/tests/functional/behavior/caching_test.cpp @@ -32,7 +32,7 @@ TEST_F(AutoFuncTests, compiled_with_cache_enabled) { core.set_property(ov::cache_dir("")); } -TEST_F(AutoFuncTests, load_cached_model_to_actual_device_and_disable_CPU_accelerating) { +TEST_F(AutoFuncTests, load_cached_model_to_actual_device_and_disable_CPU_accelerating_default_startup_fallback) { core.set_property(ov::cache_dir(cache_path)); core.set_property("MOCK_GPU", ov::device::id("test")); // device id for cache property distinguish with MOCK_CPU { @@ -74,6 +74,49 @@ TEST_F(AutoFuncTests, load_cached_model_to_actual_device_and_disable_CPU_acceler core.set_property(ov::cache_dir("")); } +TEST_F(AutoFuncTests, load_cached_model_to_actual_device_and_disable_CPU_accelerating_set_startup_fallback) { + core.set_property(ov::cache_dir(cache_path)); + core.set_property("MOCK_GPU", ov::device::id("test")); // device id for cache property distinguish with MOCK_CPU + { + auto compiled_model = core.compile_model(model_cannot_batch, + "AUTO", + {ov::device::priorities("MOCK_GPU", "MOCK_CPU"), + ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT)}); + } + // No cached model for actual device + // will cache model for both actual device and CPU plugin + ASSERT_EQ(ov::test::utils::listFilesWithExt(cache_path, "blob").size(), 2); + ov::test::utils::removeFilesWithExt(cache_path, "blob"); + { + auto compiled_model = core.compile_model( + model_cannot_batch, + "AUTO", + {ov::device::priorities("MOCK_GPU"), 
ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT)}); + } + { + auto compiled_model = core.compile_model(model_cannot_batch, + "AUTO", + {ov::device::priorities("MOCK_GPU", "MOCK_CPU"), + ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT), + ov::intel_auto::enable_startup_fallback(true)}); + } + // cached model exists for actual device + // will reuse cached model for actual device without CPU accelerating(No cached model for CPU) + ASSERT_EQ(ov::test::utils::listFilesWithExt(cache_path, "blob").size(), 1); + core.set_property("MOCK_GPU", ov::device::id("test_regenerate")); + { + auto compiled_model = core.compile_model(model_cannot_batch, + "AUTO", + {ov::device::priorities("MOCK_GPU", "MOCK_CPU"), + ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT), + ov::intel_auto::enable_startup_fallback(false)}); + } + // model hash id changed for actual device + // will cache 2 models for actual device and no cached model for CPU + ASSERT_EQ(ov::test::utils::listFilesWithExt(cache_path, "blob").size(), 2); + core.set_property(ov::cache_dir("")); +} + TEST_F(AutoFuncTests, compiled_with_cache_enabled_batch_enabled) { #ifdef ENABLE_AUTO_BATCH core.set_property(ov::cache_dir(cache_path)); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp index f7afc9641bbdce..bd05801c139dc8 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp @@ -82,6 +82,11 @@ static constexpr size_t vec_len_f32_avx2 = vec_len_avx2 / sizeof(float); auto vec_f16 = _mm256_loadu_si256(reinterpret_cast(a)); return _mm512_cvtph_ps(vec_f16); } + inline __m512 mm512_uni_loadu_tail_ps(const ov::float16* a, size_t count) { + auto mask = (1 << count) - 1; + auto f16_vec = _mm256_maskz_loadu_epi16(mask, a); + return _mm512_cvtph_ps(f16_vec); + } inline void mm512_uni_storeu_ps(ov::float16* addr, __m512 v) { __m256i vec_f16 = _mm512_cvtps_ph(v, 0); _mm256_storeu_si256(reinterpret_cast<__m256i *>(addr), vec_f16); @@ -149,6 +154,11 @@ static constexpr size_t vec_len_f32_avx2 = vec_len_avx2 / sizeof(float); auto o = _mm256_cvtph_ps(vec_f16); return o; } + inline __m256 mm256_uni_loadu_tail_ps(const ov::float16* a, const size_t count) { + ov::float16 tmp_values[8] = {0}; + std::memcpy(tmp_values, a, count * sizeof(ov::float16)); + return mm256_uni_loadu_ps(tmp_values); + } inline void mm256_uni_storeu_ps(ov::float16* a, __m256 v) { __m128i vec_f16 = _mm256_cvtps_ph(v, 0); _mm_storeu_si128(reinterpret_cast<__m128i *>(a), vec_f16); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp index cd46be617465b4..d07f7490f1bbed 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp @@ -599,10 +599,11 @@ static void attn_reduce(T* dst, float* temp, size_t M, size_t S, size_t temp_str } } -// N and K must be multiple of 16 +// N must be multiple of 16 template -void transpose_16Nx16K(TDST* dst, TSRC* src, TDST* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { - for (size_t k = 0; k < K; k += 16) { +void transpose_16NxK(TDST* dst, TSRC* src, TDST* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { + size_t k = 0; + for (; k + 16 <= K; k += 16) { for (size_t n = 0; n < N; n += 16) { transpose_16x16_kernel(dst + n, 
src + n * src_stride, dst_stride, src_stride); } @@ -610,19 +611,24 @@ void transpose_16Nx16K(TDST* dst, TSRC* src, TDST* tmp, size_t N, size_t K, size dst += 16 * dst_stride; src += 16; } + if (k < K) { + for (size_t n = 0; n < N; n += 16) { + transpose_16xK_kernel(dst + n, src + n * src_stride, K - k, dst_stride, src_stride); + } + } } #if defined(HAVE_AVX512F) -static void transpose_16Nx16K(ov::bfloat16* dst, ov::bfloat16* src, ov::bfloat16* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { +static void transpose_16NxK(ov::bfloat16* dst, ov::bfloat16* src, ov::bfloat16* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // will treat as uint32_t transpose auto s = reinterpret_cast(src); auto d = reinterpret_cast(dst); - transpose_16Nx16K(d, s, reinterpret_cast(0), N, K >> 1, dst_stride, src_stride >> 1); + transpose_16NxK(d, s, reinterpret_cast(0), N, K >> 1, dst_stride, src_stride >> 1); } #endif template -void transpose_16Nx16K(TDST* dst, uint8_t* src, TDST* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { +void transpose_16NxK(TDST* dst, uint8_t* src, TDST* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // The layout for per token per head: // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| // The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) @@ -634,7 +640,7 @@ void transpose_16Nx16K(TDST* dst, uint8_t* src, TDST* tmp, size_t N, size_t K, s s += src_stride + 2 * sizeof(float); t += src_stride; } - transpose_16Nx16K(dst, tmp, reinterpret_cast(0), N, K, dst_stride, src_stride); + transpose_16NxK(dst, tmp, reinterpret_cast(0), N, K, dst_stride, src_stride); } // dequant f16/u8 to float @@ -664,55 +670,55 @@ void dequant(TDST* dst, uint8_t* src, size_t N, size_t K) { #if defined(HAVE_AVX512F) // pack bf16/u8 to bf16 -static void pack_32x32_kernel(ov::bfloat16* dst, ov::bfloat16* src, size_t stride) { +static void pack_32x32_kernel(ov::bfloat16* dst, ov::bfloat16* src, size_t dst_stride, size_t src_stride) { static const uint64_t idx[8] = {0, 4, 1, 5, 2, 6, 3, 7}; auto midx = _mm512_loadu_si512(idx); for (size_t i = 0; i < 16; i++) { auto a = _mm512_loadu_si512(src); // [a1 a2 a3 a4 | a5 a6 a7 a8] total 512-bits in 8 64bits unit - auto b = _mm512_loadu_si512(src + stride); // [b1 b2 b3 b4 | b5 b6 b7 b8] total 512-bits + auto b = _mm512_loadu_si512(src + src_stride); // [b1 b2 b3 b4 | b5 b6 b7 b8] total 512-bits a = _mm512_permutexvar_epi64(midx, a); // [a1 a5 | a2 a6 | a3 a7 | a4 a8] b = _mm512_permutexvar_epi64(midx, b); // [b1 b5 | b2 b6 | b3 b7 | b4 b8] auto B0 = _mm512_unpacklo_epi16(a, b); // [ a1&b1 a2&b2 a3&b3 a4&b4] for each 128-bits lane, interleave word in low 64 bits auto B1 = _mm512_unpackhi_epi16(a, b); // [ a5&b5 a6&b6 a7&b7 a8&b8] for each 128-bits lane, interleave word in high 64 bits _mm512_storeu_si512(dst, B0); _mm512_storeu_si512(dst + 32, B1); - src += 2 * stride; - dst += 2 * stride; + src += 2 * src_stride; + dst += 2 * dst_stride; } } -static void pack_32x16_kernel(ov::bfloat16* dst, ov::bfloat16* src, size_t stride) { +static void pack_32x16_kernel(ov::bfloat16* dst, ov::bfloat16* src, size_t dst_stride, size_t src_stride) { static const uint64_t idx[8] = {0, 4, 1, 5, 2, 6, 3, 7}; auto midx = _mm512_loadu_si512(idx); for (size_t i = 0; i < 16; i++) { auto x = _mm256_loadu_si256(reinterpret_cast<__m256i*>(src)); // [a1 a2 a3 a4] total 256-bits in 4 64bits unit - auto y = 
_mm256_loadu_si256(reinterpret_cast<__m256i*>(src + stride)); // [b1 b2 b3 b4] total 256-bits + auto y = _mm256_loadu_si256(reinterpret_cast<__m256i*>(src + src_stride)); // [b1 b2 b3 b4] total 256-bits auto a = _mm512_castsi256_si512(x); auto b = _mm512_castsi256_si512(y); a = _mm512_permutexvar_epi64(midx, a); // [a1 x | a2 x | a3 x | a4 x] b = _mm512_permutexvar_epi64(midx, b); // [b1 x | b2 x | b3 x | b4 x] auto B0 = _mm512_unpacklo_epi16(a, b); _mm512_storeu_si512(dst, B0); - src += 2 * stride; - dst += 2 * stride; + src += 2 * src_stride; + dst += 2 * dst_stride; } } -static void pack_32Nx16K(ov::bfloat16* dst, ov::bfloat16* src, ov::bfloat16* tmp, size_t N, size_t K, size_t stride) { +static void pack_32Nx16K(ov::bfloat16* dst, ov::bfloat16* src, ov::bfloat16* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { for (size_t n = 0; n < N; n += 32) { size_t k = 0; for (; k + 32 <= K; k += 32) { - pack_32x32_kernel(dst + k * 2, src + k, stride); + pack_32x32_kernel(dst + k * 2, src + k, dst_stride, src_stride); } if (k < K) - pack_32x16_kernel(dst + k * 2, src + k, stride); + pack_32x16_kernel(dst + k * 2, src + k, dst_stride, src_stride); - dst += 32 * stride; - src += 32 * stride; + dst += 32 * dst_stride; + src += 32 * src_stride; } } -static void pack_32Nx16K(ov::bfloat16* dst, uint8_t* src, ov::bfloat16* tmp, size_t N, size_t K, size_t stride) { +static void pack_32Nx16K(ov::bfloat16* dst, uint8_t* src, ov::bfloat16* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // The layout for per token per head: // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| // The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) @@ -721,15 +727,15 @@ static void pack_32Nx16K(ov::bfloat16* dst, uint8_t* src, ov::bfloat16* tmp, siz for (size_t n = 0; n < N; n ++) { auto f = reinterpret_cast(s); attn_dequant_u8_kernel(s + 2 * sizeof(float), t, K, f[0], f[1]); - s += stride + 2 * sizeof(float); - t += stride; + s += src_stride + 2 * sizeof(float); + t += src_stride; } - pack_32Nx16K(dst, tmp, reinterpret_cast(0), N, K, stride); + pack_32Nx16K(dst, tmp, reinterpret_cast(0), N, K, dst_stride, src_stride); } #endif template -static void pack_32Nx16K(float* dst, T* src, float* tmp, size_t N, size_t K, size_t stride) { +static void pack_32Nx16K(float* dst, T* src, float* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // never called OPENVINO_THROW("pack_32Nx16K: should not be called."); } @@ -858,7 +864,7 @@ struct MHAHelper { void init_reorder_buffers(size_t batch, size_t kv_len_in_blocks) { _qk_scratch_b.resize({batch, kv_len_in_blocks, _Hk, _block_size * _S}); - _wv_scratch_b.resize({batch, kv_len_in_blocks, _Hk, _block_size * _S}); + _wv_scratch_b.resize({batch, kv_len_in_blocks, _Hk, _block_size * rnd_up(_S, _block_size)}); } // compute one block(such as 32 tokens) of query in M dimension: softmax(q_block*k')*v @@ -1307,7 +1313,7 @@ struct MHA { auto ithr = parallel_get_thread_num(); auto* k_ptr = k_cache.ptr(block_number, hk); auto* v_ptr = v_cache.ptr(block_number, hk); - transpose_16Nx16K(_helper._qk_scratch_b.template ptr(batch_in_reorder, kv_block, hk), + transpose_16NxK(_helper._qk_scratch_b.template ptr(batch_in_reorder, kv_block, hk), k_ptr, _helper._output.template ptr(ithr), _helper._block_size, @@ -1318,6 +1324,7 @@ struct MHA { _helper._output.template ptr(ithr), _helper._block_size, _helper._S, + rnd_up(_helper._S, _helper._block_size), _helper._S); } else { // 
need to decompress diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp index b39028792ee547..b719246e4976a1 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp @@ -133,6 +133,50 @@ inline void transpose_16x16_kernel(float* _dst, T* src, size_t dst_stride, size_ _mm512_storeu_si512(dst + 15 * dst_stride, rf); } +template +inline void transpose_16xK_kernel(float* _dst, T* src, size_t K, size_t dst_stride, size_t src_stride) { + auto* dst = reinterpret_cast(_dst); + __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; + r0 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src, K)); + r1 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + src_stride, K)); + r2 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 2 * src_stride, K)); + r3 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 3 * src_stride, K)); + r4 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 4 * src_stride, K)); + r5 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 5 * src_stride, K)); + r6 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 6 * src_stride, K)); + r7 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 7 * src_stride, K)); + r8 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 8 * src_stride, K)); + r9 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 9 * src_stride, K)); + ra = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 10 * src_stride, K)); + rb = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 11 * src_stride, K)); + rc = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 12 * src_stride, K)); + rd = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 13 * src_stride, K)); + re = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 14 * src_stride, K)); + rf = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 15 * src_stride, K)); + + transpose_m512i_16x16(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf); + +#define S(m) _mm512_storeu_si512(dst + 0x##m * dst_stride, r##m) +#define S8() S(0); S(1); S(2); S(3); S(4); S(5); S(6); S(7); + switch (K) { + case 8: S8(); break; + case 9: S8() S(8); break; + case 10: S8(); S(8); S(9); break; + case 11: S8(); S(8); S(9); S(a); break; + case 12: S8(); S(8); S(9); S(a); S(b); break; + case 13: S8(); S(8); S(9); S(a); S(b); S(c); break; + case 14: S8(); S(8); S(9); S(a); S(b); S(c); S(d); break; + case 15: S8(); S(8); S(9); S(a); S(b); S(c); S(d); S(e); break; + case 1: S(0); break; + case 2: S(0); S(1); break; + case 3: S(0); S(1); S(2); break; + case 4: S(0); S(1); S(2); S(3); break; + case 5: S(0); S(1); S(2); S(3); S(4); break; + case 6: S(0); S(1); S(2); S(3); S(4); S(5); break; + case 7: S(0); S(1); S(2); S(3); S(4); S(5); S(6); break; + } +} + inline void transpose_16x16_kernel(uint32_t* dst, uint32_t* src, size_t dst_stride, size_t src_stride) { __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; r0 = _mm512_loadu_si512(src); @@ -172,6 +216,50 @@ inline void transpose_16x16_kernel(uint32_t* dst, uint32_t* src, size_t dst_stri _mm512_storeu_si512(dst + 15 * dst_stride, rf); } +inline void transpose_16xK_kernel(uint32_t* dst, uint32_t* src, size_t K, size_t dst_stride, size_t src_stride) { + __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; + __mmask16 k = 0xffff >> (16 - K); + + r0 = _mm512_maskz_loadu_epi32(k, src); + r1 = _mm512_maskz_loadu_epi32(k, src + 
src_stride); + r2 = _mm512_maskz_loadu_epi32(k, src + 2 * src_stride); + r3 = _mm512_maskz_loadu_epi32(k, src + 3 * src_stride); + r4 = _mm512_maskz_loadu_epi32(k, src + 4 * src_stride); + r5 = _mm512_maskz_loadu_epi32(k, src + 5 * src_stride); + r6 = _mm512_maskz_loadu_epi32(k, src + 6 * src_stride); + r7 = _mm512_maskz_loadu_epi32(k, src + 7 * src_stride); + r8 = _mm512_maskz_loadu_epi32(k, src + 8 * src_stride); + r9 = _mm512_maskz_loadu_epi32(k, src + 9 * src_stride); + ra = _mm512_maskz_loadu_epi32(k, src + 10 * src_stride); + rb = _mm512_maskz_loadu_epi32(k, src + 11 * src_stride); + rc = _mm512_maskz_loadu_epi32(k, src + 12 * src_stride); + rd = _mm512_maskz_loadu_epi32(k, src + 13 * src_stride); + re = _mm512_maskz_loadu_epi32(k, src + 14 * src_stride); + rf = _mm512_maskz_loadu_epi32(k, src + 15 * src_stride); + + transpose_m512i_16x16(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf); + + switch (K) { + case 8: S8(); break; + case 9: S8() S(8); break; + case 10: S8(); S(8); S(9); break; + case 11: S8(); S(8); S(9); S(a); break; + case 12: S8(); S(8); S(9); S(a); S(b); break; + case 13: S8(); S(8); S(9); S(a); S(b); S(c); break; + case 14: S8(); S(8); S(9); S(a); S(b); S(c); S(d); break; + case 15: S8(); S(8); S(9); S(a); S(b); S(c); S(d); S(e); break; + case 1: S(0); break; + case 2: S(0); S(1); break; + case 3: S(0); S(1); S(2); break; + case 4: S(0); S(1); S(2); S(3); break; + case 5: S(0); S(1); S(2); S(3); S(4); break; + case 6: S(0); S(1); S(2); S(3); S(4); S(5); break; + case 7: S(0); S(1); S(2); S(3); S(4); S(5); S(6); break; + } +#undef S +#undef S8 +} + #elif defined(HAVE_AVX2) // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2 @@ -235,6 +323,64 @@ inline void transpose_16x16_kernel(float* dst, T* src, size_t dst_stride, size_t } } +template +inline void transpose_16xK_kernel(float* dst, T* src, size_t K, size_t dst_stride, size_t src_stride) { + __m256 r0, r1, r2, r3, r4, r5, r6, r7; + + if (K >= 8) { + for (int j = 0; j < 16; j += 8) { + r0 = mm256_uni_loadu_ps(src + src_stride * j); + r1 = mm256_uni_loadu_ps(src + src_stride * (1 + j)); + r2 = mm256_uni_loadu_ps(src + src_stride * (2 + j)); + r3 = mm256_uni_loadu_ps(src + src_stride * (3 + j)); + r4 = mm256_uni_loadu_ps(src + src_stride * (4 + j)); + r5 = mm256_uni_loadu_ps(src + src_stride * (5 + j)); + r6 = mm256_uni_loadu_ps(src + src_stride * (6 + j)); + r7 = mm256_uni_loadu_ps(src + src_stride * (7 + j)); + + transpose_8x8(r0, r1, r2, r3, r4, r5, r6, r7); + + _mm256_storeu_ps(dst + j, r0); + _mm256_storeu_ps(dst + j + dst_stride, r1); + _mm256_storeu_ps(dst + j + dst_stride * 2, r2); + _mm256_storeu_ps(dst + j + dst_stride * 3, r3); + _mm256_storeu_ps(dst + j + dst_stride * 4, r4); + _mm256_storeu_ps(dst + j + dst_stride * 5, r5); + _mm256_storeu_ps(dst + j + dst_stride * 6, r6); + _mm256_storeu_ps(dst + j + dst_stride * 7, r7); + } + src += 8; + dst += 8 * dst_stride; + K -= 8; + } + if (K > 0) { + for (int j = 0; j < 16; j += 8) { + r0 = mm256_uni_loadu_tail_ps(src + src_stride * j, K); + r1 = mm256_uni_loadu_tail_ps(src + src_stride * (1 + j), K); + r2 = mm256_uni_loadu_tail_ps(src + src_stride * (2 + j), K); + r3 = mm256_uni_loadu_tail_ps(src + src_stride * (3 + j), K); + r4 = mm256_uni_loadu_tail_ps(src + src_stride * (4 + j), K); + r5 = mm256_uni_loadu_tail_ps(src + src_stride * (5 + j), K); + r6 = mm256_uni_loadu_tail_ps(src + src_stride * (6 + j), K); + r7 = mm256_uni_loadu_tail_ps(src + src_stride * (7 + j), K); + + transpose_8x8(r0, r1, r2, r3, r4, r5, 
r6, r7); + +#define S(m) _mm256_storeu_ps(dst + j + m * dst_stride, r##m) + switch (K) { + case 1: S(0); break; + case 2: S(0); S(1); break; + case 3: S(0); S(1); S(2); break; + case 4: S(0); S(1); S(2); S(3); break; + case 5: S(0); S(1); S(2); S(3); S(4); break; + case 6: S(0); S(1); S(2); S(3); S(4); S(5); break; + case 7: S(0); S(1); S(2); S(3); S(4); S(5); S(6); break; + } +#undef S + } + } +} + #else template @@ -246,6 +392,15 @@ inline void transpose_16x16_kernel(TDST* dst, TSRC* src, size_t dst_stride, size } } +template +inline void transpose_16xK_kernel(TDST* dst, TSRC* src, size_t K, size_t dst_stride, size_t src_stride) { + for (size_t i = 0; i < K; i++) { + for (size_t j = 0; j < 16; j++) { + dst[i * dst_stride + j] = static_cast(src[i + j * src_stride]); + } + } +} + #endif } // namespace XARCH diff --git a/src/plugins/intel_cpu/src/nodes/roi_align_rotated.cpp b/src/plugins/intel_cpu/src/nodes/roi_align_rotated.cpp index 2ce6f78e234389..e75bef1213d2cf 100644 --- a/src/plugins/intel_cpu/src/nodes/roi_align_rotated.cpp +++ b/src/plugins/intel_cpu/src/nodes/roi_align_rotated.cpp @@ -4,7 +4,7 @@ #include "roi_align_rotated.h" -#include +#include #include "common/cpu_convert.h" #include "openvino/reference/roi_align.hpp" @@ -15,7 +15,7 @@ namespace node { ROIAlignRotated::ROIAlignRotated(const std::shared_ptr& op, const GraphContext::CPtr context) : Node(op, context, NgraphShapeInferFactory(op, EMPTY_PORT_MASK)) { - const auto roiAlign = ov::as_type_ptr(op); + const auto roiAlign = ov::as_type_ptr(op); pooledH = roiAlign->get_pooled_h(); pooledW = roiAlign->get_pooled_w(); spatialScale = roiAlign->get_spatial_scale(); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index c230d468cd7922..85e8c2e10615b7 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -303,8 +303,6 @@ std::vector disabledTestPatterns() { R"(.*(nightly|smoke)_MM_Brgemm_Static/MatMulLayerCPUTest.CompareWithRefs/MatMul_IS=\[\]_\[\]_TS=\(\(55.12\)\)_\(\(12.55\)\)_.*config=\(INFERENCE_PRECISION_HINT=bf16_\)_Fused=Multiply\(PerChannel\)_primitive=brgemm_avx512.*)", R"(.*smoke_MM_Brgemm_Dynamic_Fusing/MatMulLayerCPUTest.CompareWithRefs/MatMul_IS=\[\?.\?\]_\[\?.33\]_TS=\(\(16.12\)_\(33.7\)_\(16.12\)\)_\(\(12.33\)_\(7.33\)_\(12.33\)\)_transpose_a=0_transpose_b=0_secondaryInputType=PARAMETER_netPRC=f32_inPRC=undefined_outPRC=undefined_trgDev=CPUconfig=\(INFERENCE_PRECISION_HINT=bf16_\)_Fused=Multiply\(PerChannel\)_primitive=brgemm_avx512.*)", // Issue: 140389 - R"(.*smoke_GatherCompressedWeights_basic/GatherWeightsDecompression.CompareWithRefs.*i4.*)", - R"(.*smoke_MatmulAndGatherSharedWeightsDecompression.*weights_precision=i4.*decompression_subtract=1.*)", R"(.*FQLayerDQBias.smoke_CompareWithRefs.*)", R"(.*smoke_matmulBrgemmInt8/MatmulBrgemmInt8Test.CompareWithRefs.*MatMul.*InputType=i8_OutputType=i8.*)", R"(.*smoke_Snippets_MHAWOTransposeOnInputs_4D/MHAWOTransposeOnInputs.CompareWithRefImpl.*)", diff --git a/src/plugins/intel_cpu/tools/dump_check/dump_check.py b/src/plugins/intel_cpu/tools/dump_check/dump_check.py index f2426e3e1333f6..0ac1cd41da864e 100644 --- a/src/plugins/intel_cpu/tools/dump_check/dump_check.py +++ b/src/plugins/intel_cpu/tools/dump_check/dump_check.py @@ -75,14 +75,14 @@ def fill_tensors_from_image(input, input_file): class IEB: 
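+    # Maps the numeric precision code stored in an IEB dump header to a
+    # (numpy dtype, element size in bytes) pair; the codes below are assumed to match
+    # the element-type ids that the current CPU plugin dump writer emits.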
precision_table = { - 10:(np.float32, 4), - 12:(np.int16, 2), - 40:(np.uint8, 1), - 50:(np.int8, 1), - 70:(np.int32, 4), - 74:(np.uint32, 4), - 72:(np.int64, 8), - 73:(np.uint64, 8) + 5:(np.float32, 4), + 9:(np.int16, 2), + 14:(np.uint8, 1), + 8:(np.int8, 1), + 10:(np.int32, 4), + 15:(np.uint32, 4), + 11:(np.int64, 8), + 17:(np.uint64, 8) } @classmethod diff --git a/src/plugins/intel_gpu/CMakeLists.txt b/src/plugins/intel_gpu/CMakeLists.txt index 2f3d9127dde7e0..dc24e404c74b81 100644 --- a/src/plugins/intel_gpu/CMakeLists.txt +++ b/src/plugins/intel_gpu/CMakeLists.txt @@ -76,8 +76,8 @@ set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_REL # Workaround to avoid warnings during LTO build if(CMAKE_COMPILER_IS_GNUCXX) - set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS_RELEASE "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized" - LINK_FLAGS_RELWITHDEBINFO "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized") + set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS_RELEASE "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized -Wno-stringop-overflow" + LINK_FLAGS_RELWITHDEBINFO "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized -Wno-stringop-overflow") endif() if(ENABLE_TESTS) diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp new file mode 100644 index 00000000000000..45416b4e53810b --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp @@ -0,0 +1,94 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/node.hpp" +#include "openvino/core/partial_shape.hpp" +#include "openvino/op/op.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" + +namespace ov { +namespace intel_gpu { +namespace op { + +class SDPA : public ov::op::v13::ScaledDotProductAttention { +public: + OPENVINO_OP("SDPA", "gpu_opset"); + + SDPA() = default; + + SDPA(const ov::Output& Q, + const ov::Output& K, + const ov::Output& V, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const bool is_causal, + const ov::element::Type output_type = ov::element::undefined); + + SDPA(const ov::Output& Q, + const ov::Output& K, + const ov::Output& V, + const ov::Output& attn_mask, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const bool is_causal, + const ov::element::Type output_type = ov::element::undefined); + + SDPA(const ov::Output& Q, + const ov::Output& K, + const ov::Output& V, + const ov::Output& attn_mask, + const ov::Output& scale, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const bool is_causal, + const ov::element::Type output_type = ov::element::undefined); + + bool visit_attributes(ov::AttributeVisitor &visitor) override; + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + bool get_causal() const { return m_is_causal; } + + std::vector get_input0_transpose_order() const { return m_order_q; } + std::vector get_input1_transpose_order() const { return m_order_k; } + std::vector get_input2_transpose_order() const { return m_order_v; } + std::vector get_output_transpose_order() const { return m_order_out; } + ov::element::Type get_output_type() const { return m_output_type; } + + static 
std::vector default_order(size_t rank) { + std::vector order(rank); + std::iota(order.begin(), order.end(), 0); + return order; + } + +protected: + std::vector m_order_q; + std::vector m_order_k; + std::vector m_order_v; + std::vector m_order_out; + bool m_is_causal; + ov::element::Type m_output_type; +}; + +std::vector shape_infer(const SDPA* op, + std::vector input_shapes, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out); + + +} // namespace op +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp index 68cb607b116f24..7979870275d240 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp @@ -263,9 +263,10 @@ REGISTER_FACTORY(v12, ScatterElementsUpdate); // ------------------------------ Supported v13 ops ----------------------------- // REGISTER_FACTORY(v13, Multinomial); +REGISTER_FACTORY(v13, ScaledDotProductAttention); // ------------------------------ Supported v14 ops ----------------------------- // -REGISTER_FACTORY(v14, ROIAlignRotated); +REGISTER_FACTORY(v15, ROIAlignRotated); // --------------------------- Supported internal ops --------------------------- // REGISTER_FACTORY(internal, NonMaxSuppressionIEInternal); @@ -283,3 +284,4 @@ REGISTER_FACTORY(internal, SwiGLU); REGISTER_FACTORY(internal, IndirectGemm); REGISTER_FACTORY(internal, Convolution); REGISTER_FACTORY(internal, Placeholder); +REGISTER_FACTORY(internal, SDPA); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp new file mode 100644 index 00000000000000..f4f32a6af37d87 --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp @@ -0,0 +1,95 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "primitive.hpp" + +namespace cldnn { + +struct scaled_dot_product_attention : public primitive_base { + CLDNN_DECLARE_PRIMITIVE(scaled_dot_product_attention) + + scaled_dot_product_attention() : primitive_base("", {}) {} + + /// @brief Constructs scaled_dot_product_attention primitive. + /// @param id This primitive id. + /// @param inputs Input data primitives id (query, keys, values, [attention_mask], [scale]). + /// @param is_causal If true, assumes causal attention masking. In this case attention_mask input is ignored. 
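
A hedged usage sketch (not in this patch) of how the primitive below might be instantiated; the input ids are hypothetical and the element type of the inputs vector is assumed to be the usual cldnn::input_info. Passing four inputs makes has_attn_mask_input true while has_scale_input stays false, since both flags are derived purely from the number of inputs.

// Hypothetical construction of the SDPA primitive for tensors that arrive as
// [batch, seq_len, heads, head_size] but must be processed as
// [batch, heads, seq_len, head_size].
cldnn::scaled_dot_product_attention sdpa_prim(
    "sdpa",                                                   // primitive id (illustrative)
    {cldnn::input_info("query"), cldnn::input_info("key"),
     cldnn::input_info("value"), cldnn::input_info("attn_mask")},
    /*is_causal=*/false,
    /*input_q_transpose_order=*/{0, 2, 1, 3},
    /*input_k_transpose_order=*/{0, 2, 1, 3},
    /*input_v_transpose_order=*/{0, 2, 1, 3},
    /*output_transpose_order=*/{0, 1, 2, 3});
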
+ scaled_dot_product_attention(const primitive_id& id, + const std::vector inputs, + bool is_causal, + const std::vector& input_q_transpose_order = {}, + const std::vector& input_k_transpose_order = {}, + const std::vector& input_v_transpose_order = {}, + const std::vector& output_transpose_order = {}, + const padding& output_padding = padding()) + : primitive_base(id, inputs, {output_padding}) + , is_causal(is_causal) + , has_attn_mask_input(inputs.size() > 3) + , has_scale_input(inputs.size() > 4) + , input_q_transpose_order(input_q_transpose_order) + , input_k_transpose_order(input_k_transpose_order) + , input_v_transpose_order(input_v_transpose_order) + , output_transpose_order(output_transpose_order) {} + + + bool is_causal = false; + bool has_attn_mask_input = false; + bool has_scale_input = false; + + std::vector input_q_transpose_order; + std::vector input_k_transpose_order; + std::vector input_v_transpose_order; + std::vector output_transpose_order; + + size_t hash() const override { + size_t seed = primitive::hash(); + seed = hash_combine(seed, is_causal); + seed = hash_combine(seed, has_attn_mask_input); + seed = hash_combine(seed, has_scale_input); + seed = hash_range(seed, input_q_transpose_order.begin(), input_q_transpose_order.end()); + seed = hash_range(seed, input_k_transpose_order.begin(), input_k_transpose_order.end()); + seed = hash_range(seed, input_v_transpose_order.begin(), input_v_transpose_order.end()); + seed = hash_range(seed, output_transpose_order.begin(), output_transpose_order.end()); + return seed; + } + + bool operator==(const primitive& rhs) const override { + if (!compare_common_params(rhs)) + return false; + + auto rhs_casted = downcast(rhs); + + return is_causal == rhs_casted.is_causal && + has_attn_mask_input == rhs_casted.has_attn_mask_input && + has_scale_input == rhs_casted.has_scale_input && + input_q_transpose_order == rhs_casted.input_q_transpose_order && + input_k_transpose_order == rhs_casted.input_k_transpose_order && + input_v_transpose_order == rhs_casted.input_v_transpose_order && + output_transpose_order == rhs_casted.output_transpose_order; + } + + void save(BinaryOutputBuffer& ob) const override { + primitive_base::save(ob); + ob << is_causal; + ob << has_attn_mask_input; + ob << has_scale_input; + ob << input_q_transpose_order; + ob << input_k_transpose_order; + ob << input_v_transpose_order; + ob << output_transpose_order; + } + + void load(BinaryInputBuffer& ib) override { + primitive_base::load(ib); + ib >> is_causal; + ib >> has_attn_mask_input; + ib >> has_scale_input; + ib >> input_q_transpose_order; + ib >> input_k_transpose_order; + ib >> input_v_transpose_order; + ib >> output_transpose_order; + } +}; +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/gemm.cpp b/src/plugins/intel_gpu/src/graph/gemm.cpp index 4af921d566bffc..a8b196bd45885f 100644 --- a/src/plugins/intel_gpu/src/graph/gemm.cpp +++ b/src/plugins/intel_gpu/src/graph/gemm.cpp @@ -272,6 +272,9 @@ std::string gemm_inst::to_string(gemm_node const& node) { gemm_info.add("transpose_input1", transpose_input1); gemm_info.add("indirect_input0", indirect_input0); gemm_info.add("indirect_input1", indirect_input1); + gemm_info.add("trasnpose_order_input0", desc->input0_transpose_order); + gemm_info.add("trasnpose_order_input1", desc->input1_transpose_order); + gemm_info.add("trasnpose_order_output", desc->output_transpose_order); node_info->add("gemm info", gemm_info); node_info->dump(primitive_description); diff --git 
a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index 8bab7e44dca4fa..1a235f1293f382 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -681,14 +681,6 @@ void prepare_buffer_fusing::run(program& p) { if (gather_prim) { update_dep(gather_prim); } - - // Fallback to ocl impl since oneDNN doesn't support dynamic paddings - for (auto user : node.get_users()) { - if (user->get_preferred_impl_type() == impl_types::onednn) { - GPU_DEBUG_TRACE_DETAIL << user->id() << ": change impl to ocl because of dynamic input paddings\n"; - user->set_preferred_impl_type(impl_types::ocl); - } - } } }); program_helpers::do_for_types(*node, [](read_value_node& node) { diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp index 40264d856035e2..855ae9c421b235 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp @@ -93,6 +93,7 @@ void register_implementations() { REGISTER_OCL(eye); REGISTER_OCL(unique_count); REGISTER_OCL(unique_gather); + REGISTER_OCL(scaled_dot_product_attention); } } // namespace ocl diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp index a2f3202f816671..f0d2a72e51d848 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp @@ -74,6 +74,7 @@ #include "intel_gpu/primitives/eye.hpp" #include "intel_gpu/primitives/unique.hpp" #include "intel_gpu/primitives/kv_cache.hpp" +#include "intel_gpu/primitives/scaled_dot_product_attention.hpp" namespace cldnn { namespace ocl { @@ -172,6 +173,7 @@ REGISTER_OCL(gather_nonzero); REGISTER_OCL(eye); REGISTER_OCL(unique_count); REGISTER_OCL(unique_gather); +REGISTER_OCL(scaled_dot_product_attention); #undef REGISTER_OCL diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp new file mode 100644 index 00000000000000..d60098aca74588 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp @@ -0,0 +1,138 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "primitive_base.hpp" +#include "scaled_dot_product_attention_inst.h" +#include "sdpa/sdpa_kernel_selector.h" +#include "sdpa/sdpa_kernel_base.h" + +namespace cldnn { +namespace ocl { +struct scaled_dot_product_attention_impl : typed_primitive_impl_ocl { + using parent = typed_primitive_impl_ocl; + using parent::parent; + using kernel_selector_t = kernel_selector::sdpa_kernel_selector; + using kernel_params_t = kernel_selector::sdpa_params; + + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::scaled_dot_product_attention_impl) + + std::unique_ptr clone() const override { + return make_unique(*this); + } + + void load(BinaryInputBuffer& ib) override { + parent::load(ib); + if (is_dynamic()) { + auto& kernel_selector = kernel_selector_t::Instance(); + auto kernel_impl = kernel_selector.GetImplementation(_kernel_data.kernelName); + kernel_impl->GetUpdateDispatchDataFunc(_kernel_data); + } + } + + static kernel_selector::sdpa_configuration get_sdpa_configuration(const kernel_impl_params& impl_param) { + 
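
The configuration helper that follows first views each input's partial shape through the primitive's transpose order. A minimal sketch of that permutation on plain dimension vectors (illustrative, not part of the patch; an empty order is treated as identity, matching the lambda below):

#include <cstddef>
#include <cstdint>
#include <vector>

// order[i] names which source axis lands at output position i.
std::vector<int64_t> permute_dims(const std::vector<int64_t>& dims,
                                  const std::vector<int64_t>& order) {
    if (order.empty())
        return dims;                       // empty order means "keep as is"
    std::vector<int64_t> out(order.size());
    for (std::size_t i = 0; i < order.size(); ++i)
        out[i] = dims[order[i]];
    return out;
}

// e.g. permute_dims({1, 128, 32, 64}, {0, 2, 1, 3}) == {1, 32, 128, 64}:
// a [batch, seq, heads, head_size] layout viewed as [batch, heads, seq, head_size].
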
kernel_selector::sdpa_configuration config; + + auto transpose_pshape = [](const ov::PartialShape& pshape, const std::vector& order) { + if (order.empty()) + return pshape; + + auto transposed_pshape = ov::PartialShape::dynamic(pshape.rank()); + for (size_t i = 0; i < order.size(); i++) { + transposed_pshape[i] = pshape[order[i]]; + } + return transposed_pshape; + }; + + const auto& prim = impl_param.typed_desc(); + const auto query_shape = transpose_pshape(impl_param.get_input_layout(0).get_partial_shape(), prim->input_q_transpose_order); + const auto key_shape = transpose_pshape(impl_param.get_input_layout(1).get_partial_shape(), prim->input_k_transpose_order); + const auto value_shape = transpose_pshape(impl_param.get_input_layout(2).get_partial_shape(), prim->input_v_transpose_order); + + OPENVINO_ASSERT(key_shape == value_shape, "[GPU] The shapes of key and value inputs are expected to be equal"); + for (size_t i = 0; i < query_shape.size(); ++i) { + if (query_shape[i].is_static() && key_shape[i].is_static() && value_shape[i].is_static()) { + if (query_shape[i].get_length() > key_shape[i].get_length()) { + config.broadcast_axis = prim->input_k_transpose_order[i]; + config.group_size = query_shape[i].get_length() / key_shape[i].get_length(); + } + } + } + + if (query_shape[query_shape.size() - 1].is_static()) + config.head_size = query_shape[query_shape.size() - 1].get_length(); + + config.is_causal = prim->is_causal; + + return config; + } + + static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_dynamic) { + auto params = get_default_params(impl_param, is_dynamic); + + const auto inputs_num = impl_param.input_layouts.size(); + params.inputs.resize(inputs_num); + for (size_t i = 0; i < inputs_num; i++) { + params.inputs[i] = convert_data_tensor(impl_param.get_input_layout(i)); + } + + params.conf = get_sdpa_configuration(impl_param); + + const auto& prim = impl_param.typed_desc(); + params.input0_order = prim->input_q_transpose_order; + params.input1_order = prim->input_k_transpose_order; + params.input2_order = prim->input_v_transpose_order; + params.output_order = prim->output_transpose_order; + + params.set_dynamic_shape_offsets(); + + return params; + } + + static std::unique_ptr create(const typed_program_node& arg, const kernel_impl_params& impl_param) { + auto sdpa_kernel_params = get_kernel_params(impl_param, impl_param.is_dynamic()); + auto& sdpa_kernel_selector = kernel_selector_t::Instance(); + auto kd = sdpa_kernel_selector.get_best_kernel(sdpa_kernel_params); + + return cldnn::make_unique(kd); + } + + void update_dispatch_data(const kernel_impl_params& impl_param) override { + auto kernel_params = get_kernel_params(impl_param, true); + (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + } +}; + +namespace detail { + +attach_scaled_dot_product_attention_impl::attach_scaled_dot_product_attention_impl() { + using sdpa_prim = scaled_dot_product_attention; + + auto types = { + data_types::f32, + data_types::f16, + }; + + auto formats = { + format::bfyx, + }; + + implementation_map::add(impl_types::ocl, + shape_types::static_shape, + scaled_dot_product_attention_impl::create, + types, + formats); + + implementation_map::add(impl_types::ocl, + shape_types::dynamic_shape, + scaled_dot_product_attention_impl::create, + types, + formats); +} + +} // namespace detail +} // namespace ocl +} // namespace cldnn + +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::scaled_dot_product_attention_impl) 
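
A sketch of the grouped-query (KV-head broadcast) detection performed in get_sdpa_configuration above, written for static dimensions only; names are illustrative. When the transposed query has, say, 32 heads against 8 key/value heads, each KV head is reused for group_size = 4 query heads, and the axis to broadcast is reported in the key input's original (pre-transpose) indexing.

#include <cstddef>
#include <cstdint>
#include <vector>

struct sdpa_broadcast_info {
    int64_t axis = -1;      // axis of K/V to broadcast, in the key input's original order
    int64_t group_size = 1; // how many query heads share one KV head
};

sdpa_broadcast_info detect_kv_broadcast(const std::vector<int64_t>& q_dims,   // already transposed
                                        const std::vector<int64_t>& kv_dims,  // already transposed
                                        const std::vector<int64_t>& order_k) {
    sdpa_broadcast_info info;
    for (std::size_t i = 0; i < q_dims.size(); ++i) {
        if (q_dims[i] > kv_dims[i]) {                    // e.g. 32 query heads vs 8 KV heads
            info.axis = order_k.empty() ? int64_t(i) : order_k[i];
            info.group_size = q_dims[i] / kv_dims[i];    // 32 / 8 = 4
        }
    }
    return info;
}
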
+BIND_BINARY_BUFFER_WITH_TYPE(cldnn::scaled_dot_product_attention) diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp index cdc78316b03d47..10c1a970d1793b 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp @@ -64,6 +64,8 @@ struct gemm_onednn : typed_primitive_onednn_impl { dnnl::memory::data_type& out_dt, dnnl::memory::dims& in0_dims, dnnl::memory::dims& in1_dims, + dnnl::memory::dims& in0_strides, + dnnl::memory::dims& in1_strides, dnnl::memory::dims& out_dims, dnnl::memory::format_tag& in0_fmt, dnnl::memory::format_tag& in1_fmt, @@ -111,6 +113,22 @@ struct gemm_onednn : typed_primitive_onednn_impl { in1_fmt = onednn::convert_gemm_data_format(in1_dims, in1_l.format); out_fmt = onednn::convert_gemm_data_format(out_dims, out_l.format); + if (in0_l.data_padding) { + dnnl::memory::dims in0_padded_dims = onednn::convert_gemm_tensor(in0_l.get_buffer_size(), rank, batched_dims_can_be_removed); + if (prim->transpose_input0) { + std::swap(in0_padded_dims[in0_padded_dims.size() - 1], in0_padded_dims[in0_padded_dims.size() - 2]); + } + in0_strides = onednn::get_strides(in0_padded_dims); + } + + if (in1_l.data_padding) { + dnnl::memory::dims in1_padded_dims = onednn::convert_gemm_tensor(in1_l.get_buffer_size(), rank, batched_dims_can_be_removed); + if (prim->transpose_input1) { + std::swap(in1_padded_dims[in1_padded_dims.size() - 1], in1_padded_dims[in1_padded_dims.size() - 2]); + } + in1_strides = onednn::get_strides(in1_padded_dims); + } + if (prim->transpose_input0) { in0_fmt = transpose_format(in0_fmt); std::swap(in0_dims[in0_dims.size() - 1], in0_dims[in0_dims.size() - 2]); @@ -130,6 +148,19 @@ struct gemm_onednn : typed_primitive_onednn_impl { } } + static dnnl::memory::desc get_input_memory_desc(const dnnl::memory::dims& dims, + dnnl::memory::data_type dt, + dnnl::memory::format_tag fmt, + const dnnl::memory::dims& strides) { + dnnl::memory::desc res; + if (strides.empty()) { + res = dnnl::memory::desc(dims, dt, fmt); + } else { + res = dnnl::memory::desc(dims, dt, strides); + } + return res; + } + static std::shared_ptr get_gemm_primitive_descriptor(const kernel_impl_params& impl_params, const dnnl::primitive_attr& attr = dnnl::primitive_attr()) { auto& engine = impl_params.prog->get_engine(); @@ -146,16 +177,19 @@ struct gemm_onednn : typed_primitive_onednn_impl { dnnl::memory::dims out_dims; dnnl::memory::dims bias_dims; + dnnl::memory::dims in0_strides; + dnnl::memory::dims in1_strides; + dnnl::memory::format_tag in0_fmt; dnnl::memory::format_tag in1_fmt; dnnl::memory::format_tag out_fmt; dnnl::memory::format_tag bias_fmt; - get_gemm_primitive_md(impl_params, in0_dt, in1_dt, out_dt, in0_dims, in1_dims, out_dims, in0_fmt, in1_fmt, out_fmt, - gemm_with_bias, bias_dt, bias_dims, bias_fmt); + get_gemm_primitive_md(impl_params, in0_dt, in1_dt, out_dt, in0_dims, in1_dims, in0_strides, in1_strides, + out_dims, in0_fmt, in1_fmt, out_fmt, gemm_with_bias, bias_dt, bias_dims, bias_fmt); - dnnl::memory::desc in0_md(in0_dims, in0_dt, in0_fmt); - dnnl::memory::desc in1_md(in1_dims, in1_dt, in1_fmt); + dnnl::memory::desc in0_md = get_input_memory_desc(in0_dims, in0_dt, in0_fmt, in0_strides); + dnnl::memory::desc in1_md = get_input_memory_desc(in1_dims, in1_dt, in1_fmt, in1_strides); dnnl::memory::desc out_md(out_dims, out_dt, out_fmt); if (gemm_with_bias) { @@ -199,13 +233,16 @@ struct gemm_onednn : typed_primitive_onednn_impl { 
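
The hunk above switches padded GEMM inputs from plain format-tag descriptors to explicit strides computed from the padded buffer dimensions (with the last two dims swapped first when the input is transposed). A standalone sketch of the stride computation used by onednn::get_strides, which this diff adds further down (illustrative only):

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Dense row-major strides for a set of (possibly padded) dimensions: the
// innermost axis has stride 1, each outer axis the product of all inner dims.
std::vector<int64_t> row_major_strides(const std::vector<int64_t>& dims) {
    std::vector<int64_t> strides(dims.size(), 1);
    std::partial_sum(dims.rbegin(), dims.rend() - 1, strides.rbegin() + 1,
                     std::multiplies<int64_t>());
    return strides;
}

// row_major_strides({2, 8, 16, 32}) == {4096, 512, 32, 1}
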
dnnl::memory::dims out_dims; dnnl::memory::dims bias_dims; + dnnl::memory::dims in0_strides; + dnnl::memory::dims in1_strides; + dnnl::memory::format_tag in0_fmt; dnnl::memory::format_tag in1_fmt; dnnl::memory::format_tag out_fmt; dnnl::memory::format_tag bias_fmt; - get_gemm_primitive_md(*impl_params, in0_dt, in1_dt, out_dt, in0_dims, in1_dims, out_dims, in0_fmt, in1_fmt, out_fmt, - gemm_with_bias, bias_dt, bias_dims, bias_fmt); + get_gemm_primitive_md(*impl_params, in0_dt, in1_dt, out_dt, in0_dims, in1_dims, in0_strides, in1_strides, + out_dims, in0_fmt, in1_fmt, out_fmt, gemm_with_bias, bias_dt, bias_dims, bias_fmt); ob << make_data(&in0_dt, sizeof(dnnl::memory::data_type)); ob << make_data(&in1_dt, sizeof(dnnl::memory::data_type)); @@ -215,6 +252,9 @@ struct gemm_onednn : typed_primitive_onednn_impl { ob << in1_dims; ob << out_dims; + ob << in0_strides; + ob << in1_strides; + ob << make_data(&in0_fmt, sizeof(dnnl::memory::format_tag)); ob << make_data(&in1_fmt, sizeof(dnnl::memory::format_tag)); ob << make_data(&out_fmt, sizeof(dnnl::memory::format_tag)); @@ -248,6 +288,9 @@ struct gemm_onednn : typed_primitive_onednn_impl { dnnl::memory::dims out_dims; dnnl::memory::dims bias_dims; + dnnl::memory::dims in0_strides; + dnnl::memory::dims in1_strides; + dnnl::memory::format_tag in0_fmt = dnnl::memory::format_tag::undef; dnnl::memory::format_tag in1_fmt = dnnl::memory::format_tag::undef; dnnl::memory::format_tag out_fmt = dnnl::memory::format_tag::undef; @@ -261,6 +304,9 @@ struct gemm_onednn : typed_primitive_onednn_impl { ib >> in1_dims; ib >> out_dims; + ib >> in0_strides; + ib >> in1_strides; + ib >> make_data(&in0_fmt, sizeof(dnnl::memory::format_tag)); ib >> make_data(&in1_fmt, sizeof(dnnl::memory::format_tag)); ib >> make_data(&out_fmt, sizeof(dnnl::memory::format_tag)); @@ -271,8 +317,8 @@ struct gemm_onednn : typed_primitive_onednn_impl { ib >> make_data(&bias_fmt, sizeof(dnnl::memory::format_tag)); } - dnnl::memory::desc in0_md(in0_dims, in0_dt, in0_fmt); - dnnl::memory::desc in1_md(in1_dims, in1_dt, in1_fmt); + dnnl::memory::desc in0_md = get_input_memory_desc(in0_dims, in0_dt, in0_fmt, in0_strides); + dnnl::memory::desc in1_md = get_input_memory_desc(in1_dims, in1_dt, in1_fmt, in1_strides); dnnl::memory::desc out_md(out_dims, out_dt, out_fmt); if (gemm_with_bias) { diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp index f77b4469b1f619..6214a8db4d8255 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp @@ -94,6 +94,12 @@ dnnl::memory::dims flatten_tensor(cldnn::tensor t) { return {static_cast(t.count())}; } +dnnl::memory::dims get_strides(dnnl::memory::dims dims) { + dnnl::memory::dims strides(dims.size(), dnnl::memory::dim(1)); + std::partial_sum(dims.rbegin(), dims.rend() - 1, strides.rbegin() + 1, std::multiplies()); + return strides; +} + dnnl::memory::data_type convert_data_type(cldnn::data_types dt) { switch (dt) { case cldnn::data_types::f32: return dnnl::memory::data_type::f32; diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp index a789107e2cf2bb..e8127b698f57d5 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp @@ -28,6 +28,7 @@ dnnl::memory::dims convert_tensor(cldnn::tensor t, size_t dims = 2, bool is_grou dnnl::memory::dims convert_gemm_tensor(cldnn::tensor t, 
size_t dims, bool batched_dims_can_be_removed); dnnl::memory::dims convert_spatials(cldnn::tensor t, size_t dims = 2); dnnl::memory::dims flatten_tensor(cldnn::tensor t); +dnnl::memory::dims get_strides(dnnl::memory::dims dims); dnnl::memory::data_type convert_data_type(cldnn::data_types dt); dnnl::memory::format_tag convert_data_format(cldnn::format fmt); cldnn::format convert_data_format(dnnl::memory::format_tag fmt); diff --git a/src/plugins/intel_gpu/src/graph/include/scaled_dot_product_attention_inst.h b/src/plugins/intel_gpu/src/graph/include/scaled_dot_product_attention_inst.h new file mode 100644 index 00000000000000..cecb2a0f609550 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/include/scaled_dot_product_attention_inst.h @@ -0,0 +1,40 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "intel_gpu/primitives/scaled_dot_product_attention.hpp" +#include "primitive_inst.h" + +#include + +namespace cldnn { + +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + +public: + using parent::parent; + + program_node& input(size_t index = 0) const { return get_dependency(index); } + std::vector get_shape_infer_dependencies() const override { return {}; } +}; +using scaled_dot_product_attention_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + using parent::parent; + +public: + template + static std::vector calc_output_layouts(scaled_dot_product_attention_node const& /*node*/, const kernel_impl_params& impl_param); + static layout calc_output_layout(scaled_dot_product_attention_node const& node, kernel_impl_params const& impl_param); + static std::string to_string(scaled_dot_product_attention_node const& node); + + typed_primitive_inst(network& network, scaled_dot_product_attention_node const& desc); +}; + +using scaled_dot_product_attention_inst = typed_primitive_inst; +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/roi_align.cpp b/src/plugins/intel_gpu/src/graph/roi_align.cpp index 106f92623142a6..22e5dd4bf98f6c 100644 --- a/src/plugins/intel_gpu/src/graph/roi_align.cpp +++ b/src/plugins/intel_gpu/src/graph/roi_align.cpp @@ -50,7 +50,7 @@ std::vector roi_align_inst::calc_output_layouts(roi_align_node const& no output_shapes = shape_infer(&op, input_shapes); if (primitive->roi_mode == roi_align::ROIMode::rotated) { - PERFORM_SHAPE_INFERENCE(ov::op::v14::ROIAlignRotated); + PERFORM_SHAPE_INFERENCE(ov::op::v15::ROIAlignRotated); } else { PERFORM_SHAPE_INFERENCE(ov::op::v3::ROIAlign); } diff --git a/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp new file mode 100644 index 00000000000000..42e5aeb9f1302e --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp @@ -0,0 +1,87 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "scaled_dot_product_attention_inst.h" + +#include "primitive_type_base.h" +#include "intel_gpu/runtime/error_handler.hpp" +#include "json_object.h" +#include +#include + +#include "scaled_dot_product_attention_shape_inference.hpp" +#include "intel_gpu/op/sdpa.hpp" + +namespace cldnn { +GPU_DEFINE_PRIMITIVE_TYPE_ID(scaled_dot_product_attention) + +layout scaled_dot_product_attention_inst::calc_output_layout(scaled_dot_product_attention_node const& /* node */, + 
kernel_impl_params const& impl_param) { + auto desc = impl_param.typed_desc(); + + return impl_param.get_input_layout(0); +} + +template +std::vector scaled_dot_product_attention_inst::calc_output_layouts(scaled_dot_product_attention_node const& /*node*/, + const kernel_impl_params& impl_param) { + auto prim = impl_param.typed_desc(); + auto input0_layout = impl_param.get_input_layout(0); + + auto default_out_dt = data_type_traits::is_floating_point(input0_layout.data_type) ? input0_layout.data_type : data_types::f32; + auto output_type = prim->output_data_types[0].value_or(default_out_dt); + + if (impl_param.has_fused_primitives()) { + output_type = impl_param.get_output_element_type(); + } + + ov::intel_gpu::op::SDPA op; + + std::vector input_shapes; + for (size_t i = 0; i < impl_param.input_layouts.size(); i++) { + input_shapes.push_back(impl_param.get_input_layout(0).get()); + } + + std::vector output_shapes = ov::intel_gpu::op::shape_infer(&op, + input_shapes, + prim->input_q_transpose_order, + prim->input_k_transpose_order, + prim->input_v_transpose_order, + prim->output_transpose_order); + + cldnn::format output_format = input0_layout.format; + + return { layout{output_shapes[0], output_type, output_format, prim->output_paddings[0]} }; +} + +template std::vector scaled_dot_product_attention_inst::calc_output_layouts(scaled_dot_product_attention_node const& node, + const kernel_impl_params& impl_param); + +std::string scaled_dot_product_attention_inst::to_string(scaled_dot_product_attention_node const& node) { + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + auto& input = node.input(); + + std::stringstream primitive_description; + + json_composite scaled_dot_product_attention_info; + scaled_dot_product_attention_info.add("input id", input.id()); + scaled_dot_product_attention_info.add("is_causal", desc->is_causal); + scaled_dot_product_attention_info.add("has_attn_mask_input", desc->has_attn_mask_input); + scaled_dot_product_attention_info.add("has_scale_input", desc->has_scale_input); + scaled_dot_product_attention_info.add("input_q_transpose_order", desc->input_q_transpose_order); + scaled_dot_product_attention_info.add("input_k_transpose_order", desc->input_k_transpose_order); + scaled_dot_product_attention_info.add("input_v_transpose_order", desc->input_v_transpose_order); + scaled_dot_product_attention_info.add("output_transpose_order", desc->output_transpose_order); + + node_info->add("scaled_dot_product_attention_info", scaled_dot_product_attention_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +scaled_dot_product_attention_inst::typed_primitive_inst(network& network, scaled_dot_product_attention_node const& node) + : parent(network, node) {} + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl new file mode 100644 index 00000000000000..14cef4010c6bea --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -0,0 +1,1169 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" +#include "include/batch_headers/common.cl" +#include "include/batch_headers/sub_group_block_read.cl" +#include "include/batch_headers/sub_group_block_write.cl" +#include "include/batch_headers/sub_group_shuffle.cl" + +// query_input [batch, heads_num, q_len, head_size] +// key_input [batch, kv_heads_num, 
kv_len, head_size] +// value_input [batch, kv_heads_num, kv_len, head_size] +// attn_mask [1, 1, q_len, kv_len] +// output [batch, heads_num, q_len, head_size] +// exp_sums [batch, heads_num, q_len, partition_idx] +// max_logits [batch, heads_num, q_len, partition_idx] +// tmp_out [batch, heads_num, q_len, partition_idx, head_size] + + +inline uint FUNC(get_input0_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#if INPUT0_SIMPLE + return GET_DATA_INDEX_6D_SAFE(INPUT0, b, f, w, z, y, x); +#else +#if INPUT0_DIMS == 4 + return INPUT0_GET_INDEX_SAFE(b, f, y, x); +#elif INPUT0_DIMS == 5 + return INPUT0_GET_INDEX_SAFE(b, f, z, y, x); +#elif INPUT0_DIMS == 6 + return INPUT0_GET_INDEX_SAFE(b, f, w, z, y, x); +#else +# error sdpa_ref.cl : Unsupported input 0 format +#endif +#endif +} + +inline uint FUNC(get_input0_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef INPUT0_DIMS_ORDER + return FUNC_CALL(get_input0_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR INPUT0_DIMS_ORDER); +#else + return FUNC_CALL(get_input0_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, x); +#endif +} + +inline uint FUNC(get_input1_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef DO_BROADCAST_KEY_VALUE + DO_BROADCAST_KEY_VALUE; +#endif +#if INPUT1_SIMPLE + return GET_DATA_INDEX_6D_SAFE(INPUT1, b, f, w, z, y, x); +#else +#if INPUT1_DIMS == 4 + return INPUT1_GET_INDEX_SAFE(b, f, y, x); +#elif INPUT1_DIMS == 5 + return INPUT1_GET_INDEX_SAFE(b, f, z, y, x); +#elif INPUT1_DIMS == 6 + return INPUT1_GET_INDEX_SAFE(b, f, w, z, y, x); +#else +# error sdpa_ref.cl : Unsupported input 1 format +#endif +#endif +} + +inline uint FUNC(get_input1_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef INPUT1_DIMS_ORDER + return FUNC_CALL(get_input1_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR INPUT1_DIMS_ORDER); +#else + return FUNC_CALL(get_input1_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, x); +#endif +} + +inline uint FUNC(get_input2_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef DO_BROADCAST_KEY_VALUE + DO_BROADCAST_KEY_VALUE; +#endif +#if INPUT2_SIMPLE + return GET_DATA_INDEX_6D_SAFE(INPUT2, b, f, w, z, y, x); +#else +#if INPUT2_DIMS == 4 + return INPUT2_GET_INDEX_SAFE(b, f, y, x); +#elif INPUT2_DIMS == 5 + return INPUT2_GET_INDEX_SAFE(b, f, z, y, x); +#elif INPUT2_DIMS == 6 + return INPUT2_GET_INDEX_SAFE(b, f, w, z, y, x); +#else +# error sdpa_ref.cl : Unsupported input 1 format +#endif +#endif +} + +inline uint FUNC(get_input2_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef INPUT2_DIMS_ORDER + return FUNC_CALL(get_input2_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR INPUT2_DIMS_ORDER); +#else + return FUNC_CALL(get_input2_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, x); +#endif +} + +#define VALUE_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT2_TYPE, 1, ptr, offset) +#define SUBGROUPS_PER_WG (HEAD_SIZE / SUBGROUP_SIZE) + +#ifdef SDPA_STAGE_0 + +#if TARGET_SEQ_LEN_BLOCK_SIZE == 1 +/* This version is used for 2nd token */ + +REQD_SUB_GROUP_SIZE(SUBGROUP_SIZE) +KERNEL(sdpa_opt)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* query_input, + const __global INPUT1_TYPE* key_input, + const __global INPUT2_TYPE* value_input, +#if HAS_ATTN_MASK_INPUT + const __global INPUT3_TYPE* attn_mask, +#endif +#if HAS_SCALE_INPUT + const __global INPUT4_TYPE* scale, +#endif + __global OUTPUT_TYPE* output, + __global 
SOFTMAX_ACCUMULATOR_TYPE* exp_sums, + __global SOFTMAX_ACCUMULATOR_TYPE* max_logits, + __global OUTPUT_TYPE* tmp_out +) +{ + const uint batch_idx = get_global_id(0); + const uint b0_idx = batch_idx / NUM_HEADS; /* BATCH dim */ + const uint b1_idx = batch_idx % NUM_HEADS; /* HEADS_NUM dim */ + +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint target_seq_idx = (uint)get_global_id(1) * TARGET_SEQ_LEN_BLOCK_SIZE; +#else + const uint target_seq_idx = get_global_id(1); +#endif + const uint lid = get_local_id(2); + const uint head_size_idx = lid; + + const uint sgid = get_sub_group_id(); + const uint sglid = get_sub_group_local_id(); + + const uint partition_idx = get_group_id(2); + const uint num_of_partitions = get_num_groups(2); + const uint wi_num_per_partition = get_local_size(2); + + const uint start_partition_idx = partition_idx * SEQ_LEN_PARTITION_SIZE; + const uint partition_seq_len = + ((partition_idx + 1) < num_of_partitions) ? (SEQ_LEN_PARTITION_SIZE) + : (SOURCE_SEQ_LEN - partition_idx * SEQ_LEN_PARTITION_SIZE); + + // SLM for query inputs + __local INPUT0_TYPE query_local[HEAD_SIZE * TARGET_SEQ_LEN_BLOCK_SIZE]; + // SLM for intermediate QK results + __local OUTPUT_TYPE qk_local[SEQ_LEN_PARTITION_SIZE * TARGET_SEQ_LEN_BLOCK_SIZE]; + // SLM buffers for SoftMax calculation and qk_max/qk_sums results aggregation across all WG + __local SOFTMAX_ACCUMULATOR_TYPE qk_max_vals[SUBGROUPS_PER_WG * TARGET_SEQ_LEN_BLOCK_SIZE]; + __local SOFTMAX_ACCUMULATOR_TYPE qk_sum_vals[SUBGROUPS_PER_WG * TARGET_SEQ_LEN_BLOCK_SIZE]; + + { + // Gemm1 and SoftMax calculation + + SOFTMAX_ACCUMULATOR_TYPE qk_max[TARGET_SEQ_LEN_BLOCK_SIZE] = {SOFTMAX_ACCUMULATOR_VAL_MIN}; + for (uint i = 0; i < TARGET_SEQ_LEN_BLOCK_SIZE; i++) { + qk_max[i] = SOFTMAX_ACCUMULATOR_VAL_MIN; + } + + { + // Gemm1 calculation +#if HAS_SCALE_INPUT + const OUTPUT_TYPE scale_val = *scale; +#else + const OUTPUT_TYPE scale_val = OUTPUT_VAL_ONE / sqrt(TO_OUTPUT_TYPE(HEAD_SIZE)); +#endif + { + // Query input loading to SLM + #define QUERY_STEP_LOCAL SUBGROUP_SIZE * SUBGROUPS_PER_WG + uint query_local_offset = sgid * SUBGROUP_SIZE + sglid; + +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); +#else + const uint seq_idx_end = 1; +#endif +#ifdef INPUT0_DIMS_ORDER + uint query_offset = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, target_seq_idx, (sgid * SUBGROUP_SIZE)); + uint query_offset_next_seq = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, target_seq_idx + 1, (sgid * SUBGROUP_SIZE)); + const uint query_pitch = query_offset_next_seq - query_offset; +#else + uint query_offset = INPUT0_GET_INDEX(b0_idx, b1_idx, target_seq_idx, (sgid * SUBGROUP_SIZE)); + const uint query_pitch = QUERY_STEP_LOCAL; +#endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + #define QUERY_BLOCK_SIZE 1 + + INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, QUERY_BLOCK_SIZE, query_input, query_offset); + + query_local[query_local_offset] = val; + query_local_offset += QUERY_STEP_LOCAL; + query_offset += query_pitch; + } + #undef QUERY_BLOCK_SIZE + #undef QUERY_STEP + + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Main Gemm1 calculation loop + // Each SG performs element-wise multiplications of Q[HEAD_SIZE]xK[HEAD_SIZE] values + // HEAD_SIZE / SUBGROUPS_PER_WG times in the loop and saves the result to the qk_local SLM buffer + for (uint seq_len = sgid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE / SUBGROUP_SIZE)) { +#ifdef 
INPUT1_DIMS_ORDER + uint key_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len, 0); +#else + uint key_offset = INPUT1_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len, 0); +#endif + + INPUT0_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {INPUT0_VAL_ZERO}; + + uint head_idx_index = 0; + #define KEY_BLOCK_SIZE 8 + for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { + #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); + #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) + + KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + + uint query_offset = head_idx_index + sglid; + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + QUERY_BLOCK query_vals_reg; + unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { + query_vals_reg[i] = query_local[query_offset + i * SUBGROUP_SIZE]; + } + + unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { + acc[seq_idx] = mad(query_vals_reg[i], key_vals[i], acc[seq_idx]); + } + + query_offset += HEAD_SIZE; + } + } + + #define KEY_BLOCK_SIZE 4 + for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { + #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); + #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) + + KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + + uint query_offset = head_idx_index + sglid; + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + QUERY_BLOCK query_vals_reg; + unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { + query_vals_reg[i] = query_local[query_offset + i * SUBGROUP_SIZE]; + } + + unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { + acc[seq_idx] = mad(query_vals_reg[i], key_vals[i], acc[seq_idx]); + } + + query_offset += HEAD_SIZE; + } + } + + #define KEY_BLOCK_SIZE 2 + for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { + #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); + #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) + + KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + + uint query_offset = head_idx_index + sglid; + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + QUERY_BLOCK query_vals_reg; + unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { + query_vals_reg[i] = query_local[query_offset + i * SUBGROUP_SIZE]; + } + + unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { + acc[seq_idx] = mad(query_vals_reg[i], key_vals[i], acc[seq_idx]); + } + + query_offset += HEAD_SIZE; + } + } + + #define KEY_BLOCK_SIZE 1 + for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { + #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); + #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) + + KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + + uint 
query_offset = head_idx_index + sglid; + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + QUERY_BLOCK query_vals_reg; + unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { + query_vals_reg = query_local[query_offset + i * SUBGROUP_SIZE]; + } + + acc[seq_idx] = mad(query_vals_reg, key_vals, acc[seq_idx]); + query_offset += HEAD_SIZE; + } + } + + // Sum up all accumulators accross single SG and save result to SLM + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + acc[seq_idx] = sub_group_reduce_add(acc[seq_idx]); + qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len] = acc[seq_idx]; + } + } + + { + // Wait until all SG finishes their calculations and apply scale and attention mask to the results + barrier(CLK_LOCAL_MEM_FENCE); + + INPUT0_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); +#else + const uint seq_idx_end = 1; +#endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + // Iterate over all values QK values in SLM and apply scale and attention mask + for (uint seq_len = sgid * SUBGROUP_SIZE + sglid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE)) { + // Read value from SLM and apply scale + qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len]; + qk_val[seq_idx] *= scale_val; + + // Apply attention mask +#if IS_CAUSAL + if (start_partition_idx + seq_len > target_seq_idx + seq_idx) + qk_val[seq_idx] += INPUT0_VAL_MIN; +#elif !IS_CAUSAL && HAS_ATTN_MASK_INPUT + const uint attn_mask_offset = INPUT3_GET_INDEX_SAFE(b0_idx, b1_idx, target_seq_idx + seq_idx, start_partition_idx + seq_len); + qk_val[seq_idx] += attn_mask[attn_mask_offset]; +#endif + + // Update qk_max value + qk_max[seq_idx] = SOFTMAX_ACCUMULATOR_MAX_FUNC(qk_max[seq_idx], TO_SOFTMAX_ACCUMULATOR_TYPE(qk_val[seq_idx])); + + // Save modified qk value back to SLM + qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len] = qk_val[seq_idx]; + } + } + } + } // Gemm1 calculation end + + { + // SoftMax calculation +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); +#else + const uint seq_idx_end = 1; +#endif + // Find the maximum value of qk in the subgroup + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + qk_max[seq_idx] = sub_group_reduce_max(qk_max[seq_idx]); + } + + // Find the maximum value of qk across all subgroups in the workgroup + if (sglid == 0) { + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + qk_max_vals[seq_idx * SUBGROUPS_PER_WG + sgid] = qk_max[seq_idx]; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + qk_max[seq_idx] = SOFTMAX_ACCUMULATOR_VAL_MIN; + + if (sglid < SUBGROUPS_PER_WG) + qk_max[seq_idx] = qk_max_vals[seq_idx * SUBGROUPS_PER_WG + sglid]; + + // Final maximum value of qk after reduction across all subgroups + qk_max[seq_idx] = sub_group_reduce_max(qk_max[seq_idx]); + } + + SOFTMAX_ACCUMULATOR_TYPE exp_sum[TARGET_SEQ_LEN_BLOCK_SIZE] = {SOFTMAX_ACCUMULATOR_VAL_ZERO}; + const uint qk_num_per_wi = CEIL_DIV(partition_seq_len, SUBGROUPS_PER_WG * SUBGROUP_SIZE); + for (uint qk_idx = 0; qk_idx < qk_num_per_wi; qk_idx++) { + const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + head_size_idx; + if (local_data_idx < partition_seq_len) { + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + 
SOFTMAX_ACCUMULATOR_TYPE qk_new = native_exp(TO_SOFTMAX_ACCUMULATOR_TYPE(qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + local_data_idx]) - qk_max[seq_idx]); + qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + local_data_idx] = TO_OUTPUT_TYPE(qk_new); + + exp_sum[seq_idx] += qk_new; + } + } + } + + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + exp_sum[seq_idx] = sub_group_reduce_add(exp_sum[seq_idx]); + + if (sglid == 0) + qk_sum_vals[seq_idx * SUBGROUPS_PER_WG + sgid] = exp_sum[seq_idx]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + exp_sum[seq_idx] = SOFTMAX_ACCUMULATOR_VAL_ZERO; + + if (sglid < SUBGROUPS_PER_WG) + exp_sum[seq_idx] = qk_sum_vals[seq_idx * SUBGROUPS_PER_WG + sglid]; + + // Find the final sum of all exp_sum[seq_idx] values in workgroup + exp_sum[seq_idx] = sub_group_reduce_add(exp_sum[seq_idx]); + } + + // const SOFTMAX_ACCUMULATOR_TYPE inv_exp_sum = SOFTMAX_ACCUMULATOR_VAL_ONE / exp_sum[seq_idx]; + for (uint qk_idx = 0; qk_idx < qk_num_per_wi; qk_idx++) { + const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + sgid * SUBGROUP_SIZE + sglid; + if (local_data_idx < partition_seq_len) { + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + SOFTMAX_ACCUMULATOR_TYPE qk_new = TO_SOFTMAX_ACCUMULATOR_TYPE(qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + local_data_idx]) / exp_sum[seq_idx]; + qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + local_data_idx] = TO_OUTPUT_TYPE(qk_new); + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + { + // If the number of partitions is greater than 1, save exm_sums and max_logits to the temporary buffers + // Use single WI in the WG, since all the WIs have the same value + if (num_of_partitions > 1 && head_size_idx == 0) { + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + const uint exp_sums_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions) + + (seq_idx + target_seq_idx) * (num_of_partitions) + + partition_idx; + exp_sums[exp_sums_offset] = exp_sum[seq_idx]; + + const uint max_logits_offset = exp_sums_offset; + max_logits[max_logits_offset] = qk_max[seq_idx]; + } + } + } + } // SoftMax calculation end + } // Gemm1 + SoftMax calculations end + + { + // Gemm2 calculation + OUTPUT_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {OUTPUT_VAL_ZERO}; + +#ifdef INPUT2_DIMS_ORDER + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 0, 0); + uint value_offset_next_seq = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 1, 0); + const uint value_pitch = value_offset_next_seq - value_offset; +#else + const uint value_pitch = HEAD_SIZE; +#endif + + for (uint seq_len = 0; seq_len < partition_seq_len / SUBGROUP_SIZE; seq_len++) { +#ifdef INPUT2_DIMS_ORDER + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); +#else + uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); +#endif + + OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len * SUBGROUP_SIZE + sglid]; + } + + unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { + INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + unroll_for (uint 
seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + acc[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc[seq_idx]); + } + + value_offset += value_pitch; + } + } + + const uint seq_len_leftovers_start = (partition_seq_len / SUBGROUP_SIZE) * SUBGROUP_SIZE; + for (uint seq_len = seq_len_leftovers_start; seq_len < partition_seq_len; seq_len++) { +#ifdef INPUT2_DIMS_ORDER + const uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len, head_size_idx); +#else + const uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len, head_size_idx); +#endif + + OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len]; + } + + INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + acc[seq_idx] = mad(qk_val[seq_idx], value_val, acc[seq_idx]); + } + } + + // If the number of partitions is greater than 1, save results to the temporary buffer; + // otherwise, save results directly to the main output. + if (num_of_partitions > 1) { +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); +#else + const uint seq_idx_end = 1; +#endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + // Data layout of tmp_output buf: [batch, heads_num, q_len, partition_idx, head_size] + const uint tmp_out_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + (target_seq_idx + seq_idx) * (num_of_partitions * HEAD_SIZE) + + partition_idx * (HEAD_SIZE) + + head_size_idx; + tmp_out[tmp_out_offset] = acc[seq_idx]; + } + } else { +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); +#else + const uint seq_idx_end = 1; +#endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + const uint output_offset = OUTPUT_GET_INDEX(b0_idx, b1_idx, target_seq_idx + seq_idx, head_size_idx); + + output[output_offset] = acc[seq_idx]; + } + } + } // Gemm2 calculation end +} + +#else +/* This version is used for 1st token */ + +REQD_SUB_GROUP_SIZE(SUBGROUP_SIZE) +KERNEL(sdpa_opt)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* query_input, + const __global INPUT1_TYPE* key_input, + const __global INPUT2_TYPE* value_input, +#if HAS_ATTN_MASK_INPUT + const __global INPUT3_TYPE* attn_mask, +#endif +#if HAS_SCALE_INPUT + const __global INPUT4_TYPE* scale, +#endif + __global OUTPUT_TYPE* output, + __global SOFTMAX_ACCUMULATOR_TYPE* exp_sums, + __global SOFTMAX_ACCUMULATOR_TYPE* max_logits, + __global OUTPUT_TYPE* tmp_out +) +{ + const uint batch_idx = get_global_id(0); + const uint b0_idx = batch_idx / NUM_HEADS; /* BATCH dim */ + const uint b1_idx = batch_idx % NUM_HEADS; /* HEADS_NUM dim */ + +#if TARGET_SEQ_LEN_BLOCK_SIZE != 1 && TARGET_SEQ_LEN_BLOCK_SIZE != 16 + #error TARGET_SEQ_LEN_BLOCK_SIZE unexpected size +#endif + +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint target_seq_idx = (uint)get_global_id(1) * TARGET_SEQ_LEN_BLOCK_SIZE; +#else + const uint target_seq_idx = get_global_id(1); +#endif + const uint lid = get_local_id(2); + const uint head_size_idx = lid; + + const uint sgid = get_sub_group_id(); + 
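
Both kernel variants implement the same per-row score pipeline: scale the Q·K dot products, add the causal or explicit attention mask, subtract the row maximum before exponentiation for numerical stability, and normalise by the exponent sum before the Gemm2 stage multiplies by V. A scalar reference of that pipeline (illustrative only; the per-partition exp_sums/max_logits bookkeeping used for multi-partition launches is omitted):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// scores holds the raw Q·K dot products for one query row; mask may be null.
void softmax_row(std::vector<float>& scores, float scale, const float* mask) {
    float row_max = -INFINITY;
    for (std::size_t i = 0; i < scores.size(); ++i) {
        scores[i] = scores[i] * scale + (mask ? mask[i] : 0.0f);
        row_max = std::max(row_max, scores[i]);
    }
    float exp_sum = 0.0f;
    for (float& s : scores) {
        s = std::exp(s - row_max);   // max-subtraction keeps exp() in range
        exp_sum += s;
    }
    for (float& s : scores)
        s /= exp_sum;                // probabilities later multiplied by V in Gemm2
}
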
const uint sglid = get_sub_group_local_id(); + + const uint partition_idx = get_group_id(2); + const uint num_of_partitions = get_num_groups(2); + const uint wi_num_per_partition = get_local_size(2); + + const uint start_partition_idx = partition_idx * SEQ_LEN_PARTITION_SIZE; + const uint partition_seq_len = + ((partition_idx + 1) < num_of_partitions) ? (SEQ_LEN_PARTITION_SIZE) + : (SOURCE_SEQ_LEN - partition_idx * SEQ_LEN_PARTITION_SIZE); + + const uint target_seq_len_bs = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); + + // SLM for query inputs + __local INPUT0_TYPE query_local[HEAD_SIZE * TARGET_SEQ_LEN_BLOCK_SIZE]; + // SLM for intermediate QK results + __local OUTPUT_TYPE qk_local[SEQ_LEN_PARTITION_SIZE * TARGET_SEQ_LEN_BLOCK_SIZE]; + // SLM buffers for SoftMax calculation and qk_max/qk_sums results aggregation across all WG + __local SOFTMAX_ACCUMULATOR_TYPE qk_max_vals[SUBGROUPS_PER_WG * TARGET_SEQ_LEN_BLOCK_SIZE]; + __local SOFTMAX_ACCUMULATOR_TYPE qk_sum_vals[SUBGROUPS_PER_WG * TARGET_SEQ_LEN_BLOCK_SIZE]; + + { + // Gemm1 and SoftMax calculation + + SOFTMAX_ACCUMULATOR_TYPE qk_max = SOFTMAX_ACCUMULATOR_VAL_MIN; + + { + // Gemm1 calculation +#if HAS_SCALE_INPUT + const OUTPUT_TYPE scale_val = *scale; +#else + const OUTPUT_TYPE scale_val = OUTPUT_VAL_ONE / sqrt(TO_OUTPUT_TYPE(HEAD_SIZE)); +#endif + { + // Load Query input to SLM and transpose it +#ifdef INPUT0_DIMS_ORDER + uint query_offset = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, target_seq_idx, (sgid * SUBGROUP_SIZE)); + uint query_offset_next_seq = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, target_seq_idx + 1, (sgid * SUBGROUP_SIZE)); + const uint query_pitch = query_offset_next_seq - query_offset; +#else + uint query_offset = INPUT0_GET_INDEX(b0_idx, b1_idx, target_seq_idx, (sgid * SUBGROUP_SIZE)); + const uint query_pitch = SUBGROUP_SIZE * SUBGROUPS_PER_WG; +#endif + uint query_local_offset = (sgid * SUBGROUP_SIZE + sglid) * TARGET_SEQ_LEN_BLOCK_SIZE; + if (target_seq_len_bs != TARGET_SEQ_LEN_BLOCK_SIZE) { + for (uint seq_idx = 0; seq_idx < target_seq_len_bs; seq_idx++) { + INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); + + query_local[query_local_offset] = val; + query_offset += query_pitch; + query_local_offset++; + } + } else { + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); + + query_local[query_local_offset] = val; + query_offset += query_pitch; + query_local_offset++; + } + } + } + + { + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Main Gemm1 calculation loop + uint seq_len = sgid * TARGET_SEQ_LEN_BLOCK_SIZE; + for (; seq_len < partition_seq_len; seq_len += SUBGROUPS_PER_WG * SUBGROUP_SIZE) { +#ifdef INPUT1_DIMS_ORDER + uint key_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len, 0); + uint key_offset_next_seq = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len + 1, 0); + const uint key_pitch = key_offset_next_seq - key_offset; +#else + uint key_offset = INPUT1_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len, 0); + const uint key_pitch = HEAD_SIZE; +#endif + + INPUT0_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {INPUT0_VAL_ZERO}; + + for (uint head_idx_index = 0; head_idx_index < HEAD_SIZE; head_idx_index += SUBGROUP_SIZE) { + #define KEY_BLOCK_READ(ptr, offset) 
BLOCK_READN(INPUT1_TYPE, 1, ptr, offset); + #define QUERY_VEC MAKE_VECTOR_TYPE(INPUT1_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) + + QUERY_VEC queries_vec; + uint query_local_offset = (head_idx_index * TARGET_SEQ_LEN_BLOCK_SIZE) + sglid; + unroll_for (uint q_row_idx = 0; q_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; q_row_idx++) { + queries_vec[q_row_idx] = query_local[query_local_offset]; + query_local_offset += TARGET_SEQ_LEN_BLOCK_SIZE; + } + + unroll_for (uint key_row_idx = 0; key_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; key_row_idx++) { + INPUT1_TYPE key_vals = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); + + unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { + acc[key_row_idx] = mad(sub_group_broadcast(key_vals, i), queries_vec[i], acc[key_row_idx]); + } + } + } + + { +#if !IS_CAUSAL && HAS_ATTN_MASK_INPUT + const uint attn_mask_offset = INPUT3_GET_INDEX_SAFE(b0_idx, b1_idx, target_seq_idx + sglid, start_partition_idx + seq_len); + MAKE_VECTOR_TYPE(INPUT3_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) attn_mask_vec = INPUT3_VAL_MIN; + for (uint i = 0; i < min(partition_seq_len - seq_len, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); i++) { + attn_mask_vec[i] = attn_mask[attn_mask_offset + i]; + } +#endif + unroll_for (uint i = 0; i < TARGET_SEQ_LEN_BLOCK_SIZE; i++) { + acc[i] *= scale_val; +#if IS_CAUSAL + if (start_partition_idx + seq_len + i > target_seq_idx + sglid) + acc[i] += INPUT0_VAL_MIN; +#elif !IS_CAUSAL && HAS_ATTN_MASK_INPUT + acc[i] += attn_mask_vec[i]; +#endif +#if INPUT0_TYPE_SIZE == 2 + /* Adding this clamp improves performance for some reason */ + acc[i] = SOFTMAX_ACCUMULATOR_MIN_FUNC(SOFTMAX_ACCUMULATOR_MAX_FUNC(acc[i], INPUT0_VAL_MIN), INPUT0_VAL_MAX); +#endif + if (seq_len + i >= partition_seq_len) { + acc[i] = INPUT0_VAL_MIN; + } + + qk_max = SOFTMAX_ACCUMULATOR_MAX_FUNC(qk_max, TO_SOFTMAX_ACCUMULATOR_TYPE(acc[i])); + qk_local[sglid * SEQ_LEN_PARTITION_SIZE + seq_len + i] = acc[i]; + } + } + } + } // Gemm1 calculation end + + { + // Save QK max to SLM + qk_max_vals[sglid * SUBGROUPS_PER_WG + sgid] = qk_max; + } + + { + // SoftMax calculation +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = target_seq_len_bs; +#else + const uint seq_idx_end = 1; +#endif + #define QK_MAX_NUMS_PER_SG CEIL_DIV(TARGET_SEQ_LEN_BLOCK_SIZE, SUBGROUPS_PER_WG) + #if (TARGET_SEQ_LEN_BLOCK_SIZE % SUBGROUPS_PER_WG != 0) + /* /* If TARGET_SEQ_LEN_BLOCK_SIZE is not divisible by SUBGROUPS_PER_WG, then some subgroups will have to handle more QK rows than others */ + #define QK_ITERS_END \ + (TARGET_SEQ_LEN_BLOCK_SIZE / SUBGROUPS_PER_WG + (sgid < TARGET_SEQ_LEN_BLOCK_SIZE % SUBGROUPS_PER_WG ? 
1 : 0)) + #else + #define QK_ITERS_END QK_MAX_NUMS_PER_SG + #endif + + OUTPUT_TYPE qk_max[QK_MAX_NUMS_PER_SG]; + for (uint i = 0; i < QK_MAX_NUMS_PER_SG; i++) + qk_max[i] = SOFTMAX_ACCUMULATOR_VAL_MIN; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (sglid < SUBGROUPS_PER_WG) + for (uint i = 0; i < QK_ITERS_END; i++) + qk_max[i] = qk_max_vals[(i * SUBGROUPS_PER_WG * SUBGROUPS_PER_WG) + sgid * SUBGROUPS_PER_WG + sglid]; + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + for (uint i = 0; i < QK_ITERS_END; i++) { + qk_max[i] = sub_group_reduce_max(qk_max[i]); + } + + SOFTMAX_ACCUMULATOR_TYPE exp_sum[QK_MAX_NUMS_PER_SG]; + for (uint i = 0; i < QK_MAX_NUMS_PER_SG; i++) + exp_sum[i] = SOFTMAX_ACCUMULATOR_VAL_ZERO; + + for (uint i = 0; i < QK_ITERS_END; i++) { + // TODO: Try full loop, with ternary operator + for (uint qk_idx = sglid; qk_idx < partition_seq_len; qk_idx += SUBGROUP_SIZE) { + const uint qk_offset = i * SUBGROUPS_PER_WG * SEQ_LEN_PARTITION_SIZE + sgid * SEQ_LEN_PARTITION_SIZE + qk_idx; + SOFTMAX_ACCUMULATOR_TYPE qk_val = qk_local[qk_offset]; + SOFTMAX_ACCUMULATOR_TYPE qk_new = native_exp(TO_SOFTMAX_ACCUMULATOR_TYPE(qk_val) - qk_max[i]); + qk_local[qk_offset] = qk_new; + exp_sum[i] += qk_new; + } + } + + for (uint i = 0; i < QK_ITERS_END; i++) { + exp_sum[i] = sub_group_reduce_add(exp_sum[i]); + } + + for (uint i = 0; i < QK_ITERS_END; i++) { + for (uint qk_idx = sglid; qk_idx < partition_seq_len; qk_idx += SUBGROUP_SIZE) { + const uint qk_offset = i * SUBGROUPS_PER_WG * SEQ_LEN_PARTITION_SIZE + sgid * SEQ_LEN_PARTITION_SIZE + qk_idx; + SOFTMAX_ACCUMULATOR_TYPE qk_val = TO_SOFTMAX_ACCUMULATOR_TYPE(qk_local[qk_offset]); + SOFTMAX_ACCUMULATOR_TYPE qk_new = qk_val / exp_sum[i]; + qk_local[qk_offset] = qk_new; + } + } + + { + // If the number of partitions is greater than 1, save exm_sums and max_logits to the temporary buffers + // Use single WI in the WG, since all the WIs have the same value + if (num_of_partitions > 1 && sglid == 0) { + for (uint i = 0; i < QK_MAX_NUMS_PER_SG; i++) { + if (target_seq_idx + sgid + (i * SUBGROUPS_PER_WG) < TARGET_SEQ_LEN) { + const uint exp_sums_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions) + + (target_seq_idx + sgid + (i * SUBGROUPS_PER_WG)) * (num_of_partitions) + + partition_idx; + exp_sums[exp_sums_offset] = exp_sum[i]; + + const uint max_logits_offset = exp_sums_offset; + max_logits[max_logits_offset] = qk_max[i]; + } + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + } // SoftMax calculation end + } // Gemm1 + SoftMax calculations end + + const uint seq_len_leftovers_start = (partition_seq_len / SUBGROUP_SIZE) * SUBGROUP_SIZE; + if (seq_len_leftovers_start != partition_seq_len) { + // Gemm2 calculation + OUTPUT_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {OUTPUT_VAL_ZERO}; + +#ifdef INPUT2_DIMS_ORDER + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 0, 0); + uint value_offset_next_seq = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 1, 0); + const uint value_pitch = value_offset_next_seq - value_offset; +#else + const uint value_pitch = HEAD_SIZE; +#endif + + for (uint seq_len = 0; seq_len < partition_seq_len / SUBGROUP_SIZE; seq_len++) { +#ifdef INPUT2_DIMS_ORDER + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); +#else + uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + (seq_len * 
SUBGROUP_SIZE), head_size_idx); +#endif + + OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len * SUBGROUP_SIZE + sglid]; + } + + unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { + INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + acc[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc[seq_idx]); + } + + value_offset += value_pitch; + } + } + + + /* The handling of leftovers causes significantly worse assembly code generation for the above main calculation loop. + Therefore, there are two independent branches for the calculation of QK*V matrices: + one with leftovers handling (when seq_len_leftovers_start != partition_seq_len) and one without. */ + { + OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; + uint qk_offset = min(seq_len_leftovers_start + sglid, partition_seq_len); + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + qk_val[seq_idx] = qk_local[qk_offset]; + qk_offset += SEQ_LEN_PARTITION_SIZE; + } + + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len_leftovers_start, head_size_idx); + + for (uint seq_len_idx = 0; seq_len_idx < partition_seq_len - seq_len_leftovers_start; seq_len_idx++) { + INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + + for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + acc[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], seq_len_idx), value_val, acc[seq_idx]); + } + + value_offset += value_pitch; + } + } + + // If the number of partitions is greater than 1, save results to the temporary buffer; + // otherwise, save results directly to the main output. 
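
For reference, here is a minimal host-side sketch of the intermediate-buffer addressing used when the source sequence is split into partitions. It only restates the layouts described in the kernel comments ([batch, heads_num, q_len, partition_idx, head_size] for tmp_out, the same minus head_size for exp_sums/max_logits); the struct and function names are illustrative and not part of the patch.

```cpp
#include <cstddef>

// Hypothetical mirror of the kernel's addressing of the per-partition buffers,
// shown only to make the flattened offset arithmetic below easier to follow.
struct PartitionedLayout {
    size_t heads_num, q_len, num_partitions, head_size;

    // Offset of one element of the per-partition partial output (tmp_out).
    size_t tmp_out_offset(size_t b, size_t h, size_t q, size_t p, size_t x) const {
        return b * (heads_num * q_len * num_partitions * head_size) +
               h * (q_len * num_partitions * head_size) +
               q * (num_partitions * head_size) +
               p * head_size +
               x;
    }

    // exp_sums and max_logits share a layout that drops the head_size dimension.
    size_t stats_offset(size_t b, size_t h, size_t q, size_t p) const {
        return b * (heads_num * q_len * num_partitions) +
               h * (q_len * num_partitions) +
               q * num_partitions +
               p;
    }
};
```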
+ if (num_of_partitions > 1) { +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); +#else + const uint seq_idx_end = 1; +#endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + // Data layout of tmp_output buf: [batch, heads_num, q_len, partition_idx, head_size] + const uint tmp_out_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + (target_seq_idx + seq_idx) * (num_of_partitions * HEAD_SIZE) + + partition_idx * (HEAD_SIZE) + + head_size_idx; + + tmp_out[tmp_out_offset] = acc[seq_idx]; + } + } else { +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); +#else + const uint seq_idx_end = 1; +#endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + const uint output_offset = OUTPUT_GET_INDEX(b0_idx, b1_idx, target_seq_idx + seq_idx, head_size_idx); + + output[output_offset] = acc[seq_idx]; + } + } + } else { + // Gemm2 calculation + OUTPUT_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {OUTPUT_VAL_ZERO}; + +#ifdef INPUT2_DIMS_ORDER + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 0, 0); + uint value_offset_next_seq = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 1, 0); + const uint value_pitch = value_offset_next_seq - value_offset; +#else + const uint value_pitch = HEAD_SIZE; +#endif + + for (uint seq_len = 0; seq_len < partition_seq_len / SUBGROUP_SIZE; seq_len++) { +#ifdef INPUT2_DIMS_ORDER + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); +#else + uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); +#endif + + OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len * SUBGROUP_SIZE + sglid]; + } + + unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { + INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + acc[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc[seq_idx]); + } + + value_offset += value_pitch; + } + } + + // If the number of partitions is greater than 1, save results to the temporary buffer; + // otherwise, save results directly to the main output. 
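
When the source sequence spans several partitions, each partition's partial output is already normalized by its local softmax sum, and the finalization kernel further below re-weights it using the saved (exp_sum, max_logit) pairs. A minimal scalar sketch of that reduction, assuming a single head, query row, and output channel (all names here are illustrative only):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Combine locally normalized partial outputs into the globally normalized result.
// partial_out[p] = sum_j softmax_local(qk_j) * v_j for partition p,
// exp_sum[p]     = sum_j exp(qk_j - max_logit[p]),
// max_logit[p]   = max_j qk_j within partition p.
float combine_partitions(const std::vector<float>& partial_out,
                         const std::vector<float>& exp_sum,
                         const std::vector<float>& max_logit) {
    const size_t p_num = partial_out.size();

    // Global maximum over all partitions keeps the exponentials stable.
    float global_max = max_logit[0];
    for (size_t p = 1; p < p_num; ++p)
        global_max = std::max(global_max, max_logit[p]);

    // Rescale each local sum to the common reference point and accumulate the global sum.
    std::vector<float> corrected(p_num);
    float global_sum = 0.f;
    for (size_t p = 0; p < p_num; ++p) {
        corrected[p] = exp_sum[p] * std::exp(max_logit[p] - global_max);
        global_sum += corrected[p];
    }

    // Re-weight each locally normalized partial output by its corrected share.
    float acc = 0.f;
    for (size_t p = 0; p < p_num; ++p)
        acc += partial_out[p] * corrected[p] / global_sum;
    return acc;
}
```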
+ if (num_of_partitions > 1) { +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); +#else + const uint seq_idx_end = 1; +#endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + // Data layout of tmp_output buf: [batch, heads_num, q_len, partition_idx, head_size] + const uint tmp_out_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + (target_seq_idx + seq_idx) * (num_of_partitions * HEAD_SIZE) + + partition_idx * (HEAD_SIZE) + + head_size_idx; + tmp_out[tmp_out_offset] = acc[seq_idx]; + } + } else { +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); +#else + const uint seq_idx_end = 1; +#endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + const uint output_offset = OUTPUT_GET_INDEX(b0_idx, b1_idx, target_seq_idx + seq_idx, head_size_idx); + + output[output_offset] = acc[seq_idx]; + } + } + } // Gemm2 calculation end +} + +#endif // TARGET_SEQ_LEN_BLOCK_SIZE != 1 + +#endif // SDPA_STAGE_0 + +#ifdef SDPA_STAGE_1 + +// MTL iGPU faces high register pressure issue with a higher number of REG_VERSION_MAX_VALUES_PER_WI. +// To mitigate this, add an additional level of SDPA results processing +// with lower register pressure (REG_VERSION_MAX_VALUES_PER_WI_LOWER). + +#if SOFTMAX_ACCUMULATOR_TYPE_SIZE == 4 +#define REG_VERSION_MAX_VALUES_PER_WI 24 +#define REG_VERSION_MAX_VALUES_PER_WI_LOWER 8 +#elif SOFTMAX_ACCUMULATOR_TYPE_SIZE == 2 +#define REG_VERSION_MAX_VALUES_PER_WI 48 +#define REG_VERSION_MAX_VALUES_PER_WI_LOWER 16 +#else +#error Unexpected SOFTMAX_ACCUMULATOR data type size +#endif + +// query_input [batch, heads_num, q_len, head_size] +// key_input [batch, kv_heads_num, kv_len, head_size] +// value_input [batch, kv_heads_num, kv_len, head_size] +// attn_mask [1, 1, q_len, kv_len] +// output [batch, heads_num, q_len, head_size] +// exp_sums [batch, heads_num, q_len, partition_idx] +// max_logits [batch, heads_num, q_len, partition_idx] +// tmp_out [batch, heads_num, q_len, partition_idx, head_size] + +REQD_SUB_GROUP_SIZE(SUBGROUP_SIZE) +KERNEL(sdpa_opt_finalization_stage)( + OPTIONAL_SHAPE_INFO_ARG + __global OUTPUT_TYPE* output, + const __global SOFTMAX_ACCUMULATOR_TYPE* exp_sums, + const __global SOFTMAX_ACCUMULATOR_TYPE* max_logits, + const __global OUTPUT_TYPE* tmp_out, + const uint num_of_partitions) { + const uint batch_idx = get_global_id(0); + const uint b0_idx = batch_idx / NUM_HEADS; + const uint b1_idx = batch_idx % NUM_HEADS; + const uint target_seq_idx = get_global_id(1); + const uint sglid = get_sub_group_local_id(); + + if (num_of_partitions <= SUBGROUP_SIZE * REG_VERSION_MAX_VALUES_PER_WI_LOWER) { + /* Registers kernel version, can handle up to SEQ_LEN_PARTITION_SIZE(256) * SUBGROUP_SIZE(16) * REG_VERSION_MAX_VALUES_PER_WI_LOWER(8/16) = 32768/65536 tokens */ + SOFTMAX_ACCUMULATOR_TYPE exp_sum[REG_VERSION_MAX_VALUES_PER_WI_LOWER] = {SOFTMAX_ACCUMULATOR_VAL_ZERO}; + SOFTMAX_ACCUMULATOR_TYPE max_logit[REG_VERSION_MAX_VALUES_PER_WI_LOWER] = {SOFTMAX_ACCUMULATOR_VAL_MIN}; + SOFTMAX_ACCUMULATOR_TYPE local_exp_sum = SOFTMAX_ACCUMULATOR_VAL_ZERO; + SOFTMAX_ACCUMULATOR_TYPE local_max_logit = SOFTMAX_ACCUMULATOR_VAL_MIN; + + const uint iters_num = CEIL_DIV(num_of_partitions, SUBGROUP_SIZE); + for (uint i = 0; i < iters_num; i++) { + const uint partition_idx = i * SUBGROUP_SIZE + sglid; + const uint 
exp_sums_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions) + + target_seq_idx * (num_of_partitions) + + partition_idx; + const uint max_logit_offset = exp_sums_offset; + + if (partition_idx < num_of_partitions) { + exp_sum[i] = exp_sums[exp_sums_offset]; + max_logit[i] = max_logits[max_logit_offset]; + local_max_logit = SOFTMAX_ACCUMULATOR_MAX_FUNC(local_max_logit, max_logit[i]); + } + } + + SOFTMAX_ACCUMULATOR_TYPE global_max = sub_group_reduce_max(local_max_logit); + + // Update exp_sum with respect to the global maximum + for (uint i = 0; i < iters_num; i++) { + const uint partition_idx = i * SUBGROUP_SIZE + sglid; + if (partition_idx < num_of_partitions) { + exp_sum[i] = exp_sum[i] * native_exp(max_logit[i] - global_max); + local_exp_sum += exp_sum[i]; + } + } + + SOFTMAX_ACCUMULATOR_TYPE global_sum = sub_group_reduce_add(local_exp_sum); + + for (uint head_size_idx = 0; head_size_idx < HEAD_SIZE / SUBGROUP_SIZE; head_size_idx++) { + SOFTMAX_ACCUMULATOR_TYPE acc = 0.0f; + for (uint partition_idx = 0; partition_idx < num_of_partitions; partition_idx++) { + const uint tmp_out_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + target_seq_idx * (num_of_partitions * HEAD_SIZE) + + partition_idx * (HEAD_SIZE) + + (head_size_idx * SUBGROUP_SIZE + sglid); + OUTPUT_TYPE out_val = tmp_out[tmp_out_offset]; + acc += TO_SOFTMAX_ACCUMULATOR_TYPE(out_val) * + TO_SOFTMAX_ACCUMULATOR_TYPE(sub_group_broadcast(exp_sum[partition_idx / SUBGROUP_SIZE], partition_idx % SUBGROUP_SIZE)) / + TO_SOFTMAX_ACCUMULATOR_TYPE(global_sum); + } + const uint out_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * HEAD_SIZE) + + b1_idx * (TARGET_SEQ_LEN * HEAD_SIZE) + + target_seq_idx * (HEAD_SIZE) + + (head_size_idx * SUBGROUP_SIZE + sglid); + + output[out_offset] = TO_OUTPUT_TYPE(acc); + } + } else if (num_of_partitions <= SUBGROUP_SIZE * REG_VERSION_MAX_VALUES_PER_WI) { + /* Registers kernel version, can handle up to SEQ_LEN_PARTITION_SIZE(256) * SUBGROUP_SIZE(16) * REG_VERSION_MAX_VALUES_PER_WI(24/48) = 98304/196608 tokens */ + SOFTMAX_ACCUMULATOR_TYPE exp_sum[REG_VERSION_MAX_VALUES_PER_WI] = {SOFTMAX_ACCUMULATOR_VAL_ZERO}; + SOFTMAX_ACCUMULATOR_TYPE max_logit[REG_VERSION_MAX_VALUES_PER_WI] = {SOFTMAX_ACCUMULATOR_VAL_MIN}; + SOFTMAX_ACCUMULATOR_TYPE local_exp_sum = SOFTMAX_ACCUMULATOR_VAL_ZERO; + SOFTMAX_ACCUMULATOR_TYPE local_max_logit = SOFTMAX_ACCUMULATOR_VAL_MIN; + + const uint iters_num = CEIL_DIV(num_of_partitions, SUBGROUP_SIZE); + for (uint i = 0; i < iters_num; i++) { + const uint partition_idx = i * SUBGROUP_SIZE + sglid; + const uint exp_sums_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions) + + target_seq_idx * (num_of_partitions) + + partition_idx; + const uint max_logit_offset = exp_sums_offset; + + if (partition_idx < num_of_partitions) { + exp_sum[i] = exp_sums[exp_sums_offset]; + max_logit[i] = max_logits[max_logit_offset]; + local_max_logit = SOFTMAX_ACCUMULATOR_MAX_FUNC(local_max_logit, max_logit[i]); + } + } + + SOFTMAX_ACCUMULATOR_TYPE global_max = sub_group_reduce_max(local_max_logit); + + // Update exp_sum with respect to the global maximum + for (uint i = 0; i < iters_num; i++) { + const uint partition_idx = i * SUBGROUP_SIZE + sglid; + if (partition_idx < num_of_partitions) { + exp_sum[i] = exp_sum[i] * native_exp(max_logit[i] - global_max); + local_exp_sum += exp_sum[i]; + 
} + } + + SOFTMAX_ACCUMULATOR_TYPE global_sum = sub_group_reduce_add(local_exp_sum); + + for (uint head_size_idx = 0; head_size_idx < HEAD_SIZE / SUBGROUP_SIZE; head_size_idx++) { + SOFTMAX_ACCUMULATOR_TYPE acc = 0.0f; + for (uint partition_idx = 0; partition_idx < num_of_partitions; partition_idx++) { + const uint tmp_out_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + target_seq_idx * (num_of_partitions * HEAD_SIZE) + + partition_idx * (HEAD_SIZE) + + (head_size_idx * SUBGROUP_SIZE + sglid); + OUTPUT_TYPE out_val = tmp_out[tmp_out_offset]; + acc += TO_SOFTMAX_ACCUMULATOR_TYPE(out_val) * + TO_SOFTMAX_ACCUMULATOR_TYPE(sub_group_broadcast(exp_sum[partition_idx / SUBGROUP_SIZE], partition_idx % SUBGROUP_SIZE)) / + TO_SOFTMAX_ACCUMULATOR_TYPE(global_sum); + } + const uint out_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * HEAD_SIZE) + + b1_idx * (TARGET_SEQ_LEN * HEAD_SIZE) + + target_seq_idx * (HEAD_SIZE) + + (head_size_idx * SUBGROUP_SIZE + sglid); + + output[out_offset] = TO_OUTPUT_TYPE(acc); + } + } else { + /* Global memory kernel version, can handle any number of tokens, but could be very slow. */ + SOFTMAX_ACCUMULATOR_TYPE local_exp_sum = SOFTMAX_ACCUMULATOR_VAL_ZERO; + SOFTMAX_ACCUMULATOR_TYPE local_max_logit = SOFTMAX_ACCUMULATOR_VAL_MIN; + + const uint iters_num = CEIL_DIV(num_of_partitions, SUBGROUP_SIZE); + for (uint i = 0; i < iters_num; i++) { + const uint partition_idx = i * SUBGROUP_SIZE + sglid; + const uint max_logit_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions) + + target_seq_idx * (num_of_partitions) + + partition_idx; + + + if (partition_idx < num_of_partitions) { + local_max_logit = SOFTMAX_ACCUMULATOR_MAX_FUNC(local_max_logit, max_logits[max_logit_offset]); + } + } + + SOFTMAX_ACCUMULATOR_TYPE global_max = sub_group_reduce_max(local_max_logit); + + // Calculate global sum + for (uint i = 0; i < iters_num; i++) { + const uint partition_idx = i * SUBGROUP_SIZE + sglid; + const uint exp_sums_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions) + + target_seq_idx * (num_of_partitions) + + partition_idx; + const uint max_logit_offset = exp_sums_offset; + + if (partition_idx < num_of_partitions) { + local_exp_sum += exp_sums[exp_sums_offset] * native_exp(max_logits[max_logit_offset] - global_max); + } + } + + SOFTMAX_ACCUMULATOR_TYPE global_sum = sub_group_reduce_add(local_exp_sum); + + for (uint head_size_idx = 0; head_size_idx < HEAD_SIZE / SUBGROUP_SIZE; head_size_idx++) { + SOFTMAX_ACCUMULATOR_TYPE acc = 0.0f; + for (uint partition_idx = 0; partition_idx < num_of_partitions; partition_idx++) { + const uint tmp_out_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + target_seq_idx * (num_of_partitions * HEAD_SIZE) + + partition_idx * (HEAD_SIZE) + + (head_size_idx * SUBGROUP_SIZE + sglid); + + const uint exp_sums_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions) + + target_seq_idx * (num_of_partitions) + + partition_idx; + const uint max_logit_offset = exp_sums_offset; + + SOFTMAX_ACCUMULATOR_TYPE new_exp_sum = exp_sums[exp_sums_offset] * native_exp(max_logits[max_logit_offset] - global_max); + + OUTPUT_TYPE out_val = tmp_out[tmp_out_offset]; + acc += TO_SOFTMAX_ACCUMULATOR_TYPE(out_val) 
* new_exp_sum / TO_SOFTMAX_ACCUMULATOR_TYPE(global_sum); + } + + const uint out_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * HEAD_SIZE) + + b1_idx * (TARGET_SEQ_LEN * HEAD_SIZE) + + target_seq_idx * (HEAD_SIZE) + + (head_size_idx * SUBGROUP_SIZE + sglid); + + output[out_offset] = TO_OUTPUT_TYPE(acc); + } + } +} + +#endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl new file mode 100644 index 00000000000000..cd289be026e7e3 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl @@ -0,0 +1,212 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" + +// query_input [batch, heads_num, q_len, head_size] +// key_input [batch, kv_heads_num, kv_len, head_size] +// value_input [batch, kv_heads_num, kv_len, head_size] +// attn_mask [1, 1, q_len, kv_len] +// output [batch, heads_num, q_len, head_size] +// tmp_buf [batch, heads_num, q_len, kv_len] + +// When handling long sequences and executing in FP16, accuracy can significantly vary based on two factors: +// 1) The order of scale application (which can be controlled using the APPLY_SCALE_TO_QUERY macro) +// 2) The type of SoftMax accumulator + +inline uint FUNC(get_input0_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#if INPUT0_SIMPLE + return GET_DATA_INDEX_6D_SAFE(INPUT0, b, f, w, z, y, x); +#else +#if INPUT0_DIMS == 4 + return INPUT0_GET_INDEX_SAFE(b, f, y, x); +#elif INPUT0_DIMS == 5 + return INPUT0_GET_INDEX_SAFE(b, f, z, y, x); +#elif INPUT0_DIMS == 6 + return INPUT0_GET_INDEX_SAFE(b, f, w, z, y, x); +#else +# error sdpa_ref.cl : Unsupported input 0 format +#endif +#endif +} + +inline uint FUNC(get_input0_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef INPUT0_DIMS_ORDER + return FUNC_CALL(get_input0_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR INPUT0_DIMS_ORDER); +#else + return FUNC_CALL(get_input0_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, x); +#endif +} + +inline uint FUNC(get_input1_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef DO_BROADCAST_KEY_VALUE + DO_BROADCAST_KEY_VALUE; +#endif +#if INPUT1_SIMPLE + return GET_DATA_INDEX_6D_SAFE(INPUT1, b, f, w, z, y, x); +#else +#if INPUT1_DIMS == 4 + return INPUT1_GET_INDEX_SAFE(b, f, y, x); +#elif INPUT1_DIMS == 5 + return INPUT1_GET_INDEX_SAFE(b, f, z, y, x); +#elif INPUT1_DIMS == 6 + return INPUT1_GET_INDEX_SAFE(b, f, w, z, y, x); +#else +# error sdpa_ref.cl : Unsupported input 1 format +#endif +#endif +} + +inline uint FUNC(get_input1_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef INPUT1_DIMS_ORDER + return FUNC_CALL(get_input1_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR INPUT1_DIMS_ORDER); +#else + return FUNC_CALL(get_input1_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, x); +#endif +} + +inline uint FUNC(get_input2_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef DO_BROADCAST_KEY_VALUE + DO_BROADCAST_KEY_VALUE; +#endif +#if INPUT2_SIMPLE + return GET_DATA_INDEX_6D_SAFE(INPUT2, b, f, w, z, y, x); +#else +#if INPUT2_DIMS == 4 + return INPUT2_GET_INDEX_SAFE(b, f, y, x); +#elif INPUT2_DIMS == 5 + return INPUT2_GET_INDEX_SAFE(b, f, z, y, x); +#elif INPUT2_DIMS == 6 + return INPUT2_GET_INDEX_SAFE(b, f, w, z, y, x); +#else +# error sdpa_ref.cl : Unsupported input 1 format +#endif 
+#endif +} + +inline uint FUNC(get_input2_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef INPUT2_DIMS_ORDER + return FUNC_CALL(get_input2_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR INPUT2_DIMS_ORDER); +#else + return FUNC_CALL(get_input2_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, x); +#endif +} + +#define APPLY_SCALE_TO_QUERY 1 + +KERNEL(sdpa_ref)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* query_input, + const __global INPUT1_TYPE* key_input, + const __global INPUT2_TYPE* value_input, +#if HAS_ATTN_MASK_INPUT + const __global INPUT3_TYPE* attn_mask, +#endif +#if HAS_SCALE_INPUT + const __global INPUT4_TYPE* scale, +#endif + __global OUTPUT_TYPE* output, + __global OUTPUT_TYPE* tmp_buf +) +{ + const uint batch_idx = get_global_id(0); + const uint b0 = batch_idx / NUM_HEADS; /* BATCH dim */ + const uint b1 = batch_idx % NUM_HEADS; /* HEADS_NUM dim */ + const uint target_seq_idx = get_global_id(1); + const uint head_size_idx = get_global_id(2); + +#if HAS_SCALE_INPUT + const OUTPUT_TYPE scale_val = *scale; +#else + const OUTPUT_TYPE scale_val = OUTPUT_VAL_ONE / sqrt(TO_OUTPUT_TYPE(INPUT1_SIZE_X)); +#endif + + // Process 1*seq_len elements (Gemm1 + SoftMax) using a single work item, saving results to tmp_buf and + // reusing them between all work items within a single workgroup for Gemm2 calculations. + if (get_local_id(2) == 0) { + for (uint s = 0; s < SOURCE_SEQ_LEN /* seq_len */; s++) { + OUTPUT_TYPE acc = 0; + for (uint h = 0; h < HEAD_SIZE /* head_size */; h++) { + uint query_offset = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b0, b1, 0, 0, target_seq_idx, h); + uint key_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b0, b1, 0, 0, s, h); + +#if APPLY_SCALE_TO_QUERY + INPUT0_TYPE q_val = query_input[query_offset] * scale_val; +#else + INPUT0_TYPE q_val = query_input[query_offset]; +#endif + INPUT1_TYPE k_val = key_input[key_offset]; + acc += q_val * k_val; + } + +#if !APPLY_SCALE_TO_QUERY + acc *= scale_val; +#endif + + uint tmp_buf_offset = b0 * (NUM_HEADS * TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + b1 * (TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + target_seq_idx * (SOURCE_SEQ_LEN) + s; + tmp_buf[tmp_buf_offset] = acc; + } + + ACCUMULATOR_TYPE qk_max = ACCUMULATOR_VAL_MIN; + for (uint s = 0; s < SOURCE_SEQ_LEN /* seq_len */; s++) { + uint tmp_buf_offset = b0 * (NUM_HEADS * TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + b1 * (TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + target_seq_idx * (SOURCE_SEQ_LEN) + s; +#if IS_CAUSAL + OUTPUT_TYPE attn_mask_val = s > target_seq_idx ? 
OUTPUT_VAL_MIN : 0; +#elif !IS_CAUSAL && HAS_ATTN_MASK_INPUT + uint attn_mask_offset = INPUT3_GET_INDEX_SAFE(b0, b1, target_seq_idx, s); + OUTPUT_TYPE attn_mask_val = attn_mask[attn_mask_offset]; +#else + OUTPUT_TYPE attn_mask_val = OUTPUT_VAL_ZERO; +#endif + + OUTPUT_TYPE qk_val = tmp_buf[tmp_buf_offset] + attn_mask_val; + tmp_buf[tmp_buf_offset] = qk_val; + + qk_max = ACCUMULATOR_MAX_FUNC(qk_max, TO_ACCUMULATOR_TYPE(qk_val)); + } + + ACCUMULATOR_TYPE exp_sum = ACCUMULATOR_VAL_ZERO; + for (uint s = 0; s < SOURCE_SEQ_LEN /* seq_len */; s++) { + uint tmp_buf_offset = b0 * (NUM_HEADS * TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + b1 * (TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + target_seq_idx * (SOURCE_SEQ_LEN) + s; + + OUTPUT_TYPE qk_val = tmp_buf[tmp_buf_offset]; + ACCUMULATOR_TYPE val = native_exp(TO_ACCUMULATOR_TYPE(qk_val) - qk_max); + exp_sum += val; + + tmp_buf[tmp_buf_offset] = TO_OUTPUT_TYPE(val); + } + + const ACCUMULATOR_TYPE inv_sum = ACCUMULATOR_VAL_ONE / exp_sum; + for (uint s = 0; s < SOURCE_SEQ_LEN /* seq_len */; s++) { + uint tmp_buf_offset = b0 * (NUM_HEADS * TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + b1 * (TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + target_seq_idx * (SOURCE_SEQ_LEN) + s; + + OUTPUT_TYPE qk_val = tmp_buf[tmp_buf_offset]; + ACCUMULATOR_TYPE val = TO_ACCUMULATOR_TYPE(qk_val) * inv_sum; + tmp_buf[tmp_buf_offset] = TO_OUTPUT_TYPE(val); + } + } + + barrier(CLK_GLOBAL_MEM_FENCE); + + OUTPUT_TYPE acc = 0; + for (uint s = 0; s < SOURCE_SEQ_LEN /* seq_len */; s++) { + uint tmp_buf_offset = b0 * (NUM_HEADS * TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + b1 * (TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + target_seq_idx * (SOURCE_SEQ_LEN) + s; + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0, b1, 0, 0, s, head_size_idx); + + acc += tmp_buf[tmp_buf_offset] * value_input[value_offset]; + } + + uint output_offset = OUTPUT_GET_INDEX(b0, b1, target_seq_idx, head_size_idx); + output[output_offset] = acc; +} diff --git a/src/plugins/intel_gpu/src/kernel_selector/common_types.h b/src/plugins/intel_gpu/src/kernel_selector/common_types.h index c2a4ef1653472d..768a0fc3c4f854 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/common_types.h +++ b/src/plugins/intel_gpu/src/kernel_selector/common_types.h @@ -59,6 +59,7 @@ enum class KernelType { DEPTH_TO_SPACE, BATCH_TO_SPACE, SHAPE_OF, + SDPA, SHUFFLE_CHANNELS, SLICE, STRIDED_SLICE, diff --git a/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp b/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp index 084ae71e42732c..fcd35d13a3639b 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp @@ -326,8 +326,8 @@ JitDefinitions DataTensorJitConstant::GetDefinitions() const { JitDefinitions baseDefinitions = TensorBaseTJitConstant::GetDefinitions(_tensor); JitDefinitions definitions{}; - DimensionAccessHelper dims(_tensor); - DimensionAccessHelper dims_padded(_tensor, true); + DimensionAccessHelperJit dims(_tensor); + DimensionAccessHelperJit dims_padded(_tensor, true); // shape_info layout // if only y has dynamic padding: // [dim_b, dim_f, dim_v, dim_u, dim_w, dim_z, dim_y, dim_x, pad_before_y, pad_after_y] diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.h index e3e5f3dcc47a2d..2c8256b8551b89 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.h @@ -11,9 +11,9 @@ namespace 
kernel_selector { struct weight_bias_params; struct WeightsReorderParams; -struct DimensionAccessHelper { - explicit DimensionAccessHelper(const DataTensor& t, bool padded = false) { - std::vector dims = { +struct DimensionAccessHelperBase { + explicit DimensionAccessHelperBase(const DataTensor& t) { + dims = { t.Batch(), t.Feature(), t.U(), @@ -23,6 +23,23 @@ struct DimensionAccessHelper { t.Y(), t.X(), }; + } + + Tensor::Dim& x_dim() { return dims[7]; } + Tensor::Dim& y_dim() { return dims[6]; } + Tensor::Dim& z_dim() { return dims[5]; } + Tensor::Dim& w_dim() { return dims[4]; } + Tensor::Dim& v_dim() { return dims[3]; } + Tensor::Dim& u_dim() { return dims[2]; } + Tensor::Dim& f_dim() { return dims[1]; } + Tensor::Dim& b_dim() { return dims[0]; } + + std::vector dims; +}; + +struct DimensionAccessHelperJit : virtual DimensionAccessHelperBase { + explicit DimensionAccessHelperJit(const DataTensor& t, bool padded = false) + : DimensionAccessHelperBase(t) { size_t dyn_shape_offset = t.get_dynamic_shape_offset(); size_t dyn_pad_offset = dyn_shape_offset + DataTensor::max_rank(); for (auto d : dims) { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/arg_max_min/arg_max_min_kernel_axis.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/arg_max_min/arg_max_min_kernel_axis.cpp index 2d878e4a9f28e1..ecb6be6f17020d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/arg_max_min/arg_max_min_kernel_axis.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/arg_max_min/arg_max_min_kernel_axis.cpp @@ -22,7 +22,7 @@ size_t getOperationNumber(const arg_max_min_params& params) { std::string getOperationNumberString(const arg_max_min_params& params) { const auto& output = params.outputs[0]; - DimensionAccessHelper dims(output); + DimensionAccessHelperJit dims(output); switch (params.argMaxMinAxis) { case ArgMaxMinAxis::BATCH: return toVectorMulString({dims.x(), dims.y(), dims.z(), dims.f()}); case ArgMaxMinAxis::FEATURE: return toVectorMulString({dims.x(), dims.y(), dims.z(), dims.b()}); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_base.cpp index 07734e85b9dd4a..cdd8d7fc56e39e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_base.cpp @@ -15,7 +15,7 @@ JitConstants FullyConnectedKernelBase::GetJitConstants(const fully_connected_par JitConstants jit = WeightBiasKernelBase::GetJitConstants(params); const auto& input = params.inputs[0]; if (input.is_dynamic()) { - DimensionAccessHelper dims(input); + DimensionAccessHelperJit dims(input); jit.AddConstant(MakeJitConstant("INPUT0_ELEMENTS_COUNT", toVectorMulString({dims.x(), dims.y(), dims.z(), dims.w(), dims.f()}))); } else { const auto x_size = input.LogicalSize() / input.Batch().v; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp index 2e804085939732..e59f424e5d6af7 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp @@ -135,10 +135,10 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons 
jit.Merge(MakeTypeJitConstants(params.inputs[0].GetDType(), "ACCUMULATOR")); if (params.has_dynamic_tensors()) { - DimensionAccessHelper dims0(params.inputs[0]); - DimensionAccessHelper dims1(params.inputs[1]); - DimensionAccessHelper dims0_padded(params.inputs[0], true); - DimensionAccessHelper dims1_padded(params.inputs[1], true); + DimensionAccessHelperJit dims0(params.inputs[0]); + DimensionAccessHelperJit dims1(params.inputs[1]); + DimensionAccessHelperJit dims0_padded(params.inputs[0], true); + DimensionAccessHelperJit dims1_padded(params.inputs[1], true); // Note: Actually currently this kernel is not being selected if it is shape agnostic impl && transposed inputs // Because we cannot get the original rank auto input0_dims = ConvTo8dims(params.input0_order); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/mvn/mvn_kernel_bfyx_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/mvn/mvn_kernel_bfyx_opt.cpp index 806bb90ba67b43..923bd98814a46f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/mvn/mvn_kernel_bfyx_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/mvn/mvn_kernel_bfyx_opt.cpp @@ -80,7 +80,7 @@ JitConstants MVNKernelBfyxOpt::GetJitConstants(const mvn_params& params, MVNKern if (params.has_dynamic_tensors()) { const auto& input = params.inputs[0]; - DimensionAccessHelper dims(input); + DimensionAccessHelperJit dims(input); std::string data_set_size; std::string data_set_count; if (params.mvnMode == MVNMode::WITHIN_CHANNELS) { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/non_zero/count_nonzero_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/non_zero/count_nonzero_kernel_ref.cpp index d3132e4357fa07..7e6c1397b988e4 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/non_zero/count_nonzero_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/non_zero/count_nonzero_kernel_ref.cpp @@ -76,7 +76,7 @@ KernelsData CountNonzeroKernelRef::GetKernelsData(const Params& params) const { auto cldnn_jit = MakeBaseParamsJitConstants(newParams); if (newParams.has_dynamic_tensors()) { const auto& input = newParams.inputs[0]; - DimensionAccessHelper dims(input); + DimensionAccessHelperJit dims(input); const std::string total_data_size = toVectorMulString({dims.x(), dims.y(), dims.z(), dims.w(), dims.f(), dims.b()}); cldnn_jit.AddConstants({MakeJitConstant("DATA_SIZE", total_data_size)}); } else { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/non_zero/gather_nonzero_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/non_zero/gather_nonzero_kernel_ref.cpp index 0672566e0ed2ad..bac2237893bef3 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/non_zero/gather_nonzero_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/non_zero/gather_nonzero_kernel_ref.cpp @@ -46,7 +46,7 @@ JitConstants GatherNonzeroKernelRef::GetJitConstants(const gather_nonzero_params jit.AddConstant(MakeJitConstant("MAX_LOCAL_MEM_SIZE", max_local_mem_size)); if (input.is_dynamic()) { - DimensionAccessHelper dims(input); + DimensionAccessHelperJit dims(input); const std::string total_data_size = toVectorMulString({dims.x(), dims.y(), dims.z(), dims.w(), dims.f(), dims.b()}); jit.AddConstant(MakeJitConstant("TOTAL_DATA_SIZE", total_data_size)); } else { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_tile_8x8_4x4.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_tile_8x8_4x4.cpp index 
13eb399ef8ef4d..06ee5a2bc4b6ef 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_tile_8x8_4x4.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_tile_8x8_4x4.cpp @@ -107,7 +107,7 @@ static inline std::string GetTiledOutputOrder(const permute_params& params) { std::string out_z_str = ""; const auto& output = params.outputs[0]; if (params.has_dynamic_outputs()) { - DimensionAccessHelper dims(output); + DimensionAccessHelperJit dims(output); out_y_str = dims.y(); out_z_str = dims.z(); } else { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/reduce/reduce_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/reduce/reduce_kernel_base.cpp index 80e16939bab248..318daac3b5b30e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/reduce/reduce_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/reduce/reduce_kernel_base.cpp @@ -30,7 +30,7 @@ JitConstants ReduceKernelBase::GetJitConstants(const reduce_params& params) cons const auto& output = params.outputs[0]; if (output.is_dynamic()) { - DimensionAccessHelper dims(output); + DimensionAccessHelperJit dims(output); jit.AddConstant(MakeJitConstant("COMPUTATIONAL_OPERATIONS_NUMBER", toVectorMulString({dims.x(), dims.y(), dims.z(), diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/rms/rms_kernel_bfyx_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/rms/rms_kernel_bfyx_opt.cpp index db5e8c6beb1588..15043ef2624053 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/rms/rms_kernel_bfyx_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/rms/rms_kernel_bfyx_opt.cpp @@ -30,7 +30,7 @@ JitConstants RMSKernelBfyxOpt::GetJitConstants(const rms_params& params, Dispatc if (params.has_dynamic_tensors()) { const auto& input = params.inputs[0]; - DimensionAccessHelper dims(input); + DimensionAccessHelperJit dims(input); std::string data_size; switch (params.ov_input_rank) { case 1 : diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp new file mode 100644 index 00000000000000..61028ef5348a1a --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp @@ -0,0 +1,125 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "sdpa_kernel_base.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + +static std::string GetDimsOrder(const std::vector& order_idx) { + auto get_order_idx = [](std::vector order_idx, int64_t dim_idx) { + int loc = 0; + for (auto idx : order_idx) { + if (idx == dim_idx) + break; + loc += 1; + } + return loc; + }; + + std::string dims_order = ""; + if (order_idx.size() == 2) { + const std::vector dims2 = {"y", "x"}; + dims_order = "b,f,w,z," + + dims2[get_order_idx(order_idx, 0)] + "," + dims2[get_order_idx(order_idx, 1)]; + } else if (order_idx.size() == 3) { + const std::vector dims3 = {"f", "y", "x"}; + dims_order = "b," + dims3[get_order_idx(order_idx, 0)] + ",w,z," + + dims3[get_order_idx(order_idx, 1)] + "," + dims3[get_order_idx(order_idx, 2)]; + } else if (order_idx.size() == 4) { + const std::vector dims4 = {"b", "f", "y", "x"}; + dims_order = dims4[get_order_idx(order_idx, 0)] + "," + dims4[get_order_idx(order_idx, 1)] + ",w,z," + + dims4[get_order_idx(order_idx, 2)] + "," + dims4[get_order_idx(order_idx, 3)]; + } else if (order_idx.size() == 5) { + const 
std::vector dims5 = {"b", "f", "z", "y", "x"}; + dims_order = dims5[get_order_idx(order_idx, 0)] + "," + dims5[get_order_idx(order_idx, 1)] + ",w," + + dims5[get_order_idx(order_idx, 2)] + "," + dims5[get_order_idx(order_idx, 3)] + "," + + dims5[get_order_idx(order_idx, 4)]; + } else if (order_idx.size() == 6) { + const std::vector dims6 = {"b", "f", "w", "z", "y", "x"}; + dims_order = dims6[get_order_idx(order_idx, 0)] + "," + dims6[get_order_idx(order_idx, 1)] + "," + + dims6[get_order_idx(order_idx, 2)] + "," + dims6[get_order_idx(order_idx, 3)] + "," + + dims6[get_order_idx(order_idx, 4)] + "," + dims6[get_order_idx(order_idx, 5)]; + } else { + dims_order = "b,f,w,z,y,x"; + } + return dims_order; +} + +static std::string GetBroadcastInputStr(const size_t input_rank, const int64_t axes, const int64_t val) { + std::vector dims; + if (input_rank == 1) { + dims = {"x"}; + } else if (input_rank == 2) { + dims = {"y", "x"}; + } else if (input_rank == 3) { + dims = {"f", "y", "x"}; + } else if (input_rank == 4) { + dims = {"b", "f", "y", "x"}; + } else if (input_rank == 5) { + dims = {"b", "f", "z", "y", "x"}; + } else if (input_rank == 6) { + dims = {"b", "f", "w", "z", "y", "x"}; + } + return dims[axes] + " /= " + std::to_string(val) + ";"; +} + +JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const { + auto jit = MakeBaseParamsJitConstants(params); + + if (params.conf.broadcast_axis != -1) { + jit.AddConstant(MakeJitConstant("DO_BROADCAST_KEY_VALUE", GetBroadcastInputStr(params.inputs[0].GetDims().size(), + params.conf.broadcast_axis, + params.conf.group_size))); + } + + jit.AddConstant(MakeJitConstant("IS_CAUSAL", params.conf.is_causal)); + jit.AddConstant(MakeJitConstant("HAS_ATTN_MASK_INPUT", params.inputs.size() > 3)); + jit.AddConstant(MakeJitConstant("HAS_SCALE_INPUT", params.inputs.size() > 4)); + + auto is_default_order = [](const std::vector& order) { + for (size_t i = 0; i < order.size(); i++) + if (order[i] != static_cast(i)) + return false; + return true; + }; + + if ((!params.input0_order.empty() && !is_default_order(params.input0_order)) || params.conf.broadcast_axis != -1) { + jit.AddConstant(MakeJitConstant("INPUT0_DIMS_ORDER", GetDimsOrder(params.input0_order))); + } + if ((!params.input1_order.empty() && !is_default_order(params.input1_order)) || params.conf.broadcast_axis != -1) { + jit.AddConstant(MakeJitConstant("INPUT1_DIMS_ORDER", GetDimsOrder(params.input1_order))); + } + if ((!params.input2_order.empty() && !is_default_order(params.input2_order)) || params.conf.broadcast_axis != -1) { + jit.AddConstant(MakeJitConstant("INPUT2_DIMS_ORDER", GetDimsOrder(params.input2_order))); + } + + TransposedDimensionAccessHelperJit dims_q(params.inputs[0], params.input0_order); + jit.AddConstant(MakeJitConstant("TARGET_SEQ_LEN", dims_q.y())); + jit.AddConstant(MakeJitConstant("NUM_HEADS", dims_q.f())); + + TransposedDimensionAccessHelperJit dims_k(params.inputs[1], params.input1_order); + jit.AddConstant(MakeJitConstant("SOURCE_SEQ_LEN", dims_k.y())); + + return jit; +} + +bool SDPAKernelBase::Validate(const Params& p) const { + if (p.GetType() != KernelType::SDPA) { + return false; + } + + const sdpa_params& params = static_cast(p); + + for (size_t i = 0; i < params.inputs.size(); i++) { + if (params.inputs[i].Dimentions() != 4) + return false; + } + + if (params.outputs[0].Dimentions() != 4) + return false; + + return true; +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h 
b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h new file mode 100644 index 00000000000000..1d4f30512df06b --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h @@ -0,0 +1,124 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_base_opencl.h" +#include "kernel_selector_params.h" +#include "kernel_selector_utils.h" +#include + +namespace kernel_selector { +struct TransposedDimensionAccessHelperBase : virtual DimensionAccessHelperBase { + explicit TransposedDimensionAccessHelperBase(const DataTensor& t, std::vector order) + : DimensionAccessHelperBase(t) { + size_t total_dims_count = dims.size(); + size_t new_axis_count = total_dims_count - order.size(); + + transposed_order.resize(total_dims_count); + std::iota(transposed_order.begin(), transposed_order.end(), 0); + for (size_t i = 0; i < order.size(); i++) { + size_t transposed_order_pos = i < 2 ? i : i + new_axis_count; + transposed_order[transposed_order_pos] = order[i] < 2 ? order[i] : order[i] + new_axis_count; + } + } + + Tensor::Dim& x_dim() { return dims[transposed_order[7]]; } + Tensor::Dim& y_dim() { return dims[transposed_order[6]]; } + Tensor::Dim& z_dim() { return dims[transposed_order[5]]; } + Tensor::Dim& w_dim() { return dims[transposed_order[4]]; } + Tensor::Dim& v_dim() { return dims[transposed_order[3]]; } + Tensor::Dim& u_dim() { return dims[transposed_order[2]]; } + Tensor::Dim& f_dim() { return dims[transposed_order[1]]; } + Tensor::Dim& b_dim() { return dims[transposed_order[0]]; } + + std::vector transposed_order; +}; + +struct TransposedDimensionAccessHelperJit : DimensionAccessHelperJit, TransposedDimensionAccessHelperBase { + explicit TransposedDimensionAccessHelperJit(const DataTensor& t, std::vector order, bool padded = false) + : DimensionAccessHelperBase(t) + , DimensionAccessHelperJit(t, padded) + , TransposedDimensionAccessHelperBase(t, order) {} + + std::string x() { return dims_sizes[transposed_order[7]]; } + std::string y() { return dims_sizes[transposed_order[6]]; } + std::string z() { return dims_sizes[transposed_order[5]]; } + std::string w() { return dims_sizes[transposed_order[4]]; } + std::string v() { return dims_sizes[transposed_order[3]]; } + std::string u() { return dims_sizes[transposed_order[2]]; } + std::string f() { return dims_sizes[transposed_order[1]]; } + std::string b() { return dims_sizes[transposed_order[0]]; } + + std::pair x_pad() { + return {pad_before_after_sizes[(transposed_order[7] * 2) + 0], pad_before_after_sizes[(transposed_order[7] * 2) + 1]}; + } + std::pair y_pad() { + return {pad_before_after_sizes[(transposed_order[6] * 2) + 0], pad_before_after_sizes[(transposed_order[6] * 2) + 1]}; + } + std::pair z_pad() { + return {pad_before_after_sizes[(transposed_order[5] * 2) + 0], pad_before_after_sizes[(transposed_order[5] * 2) + 1]}; + } + std::pair w_pad() { + return {pad_before_after_sizes[(transposed_order[4] * 2) + 0], pad_before_after_sizes[(transposed_order[4] * 2) + 1]}; + } + std::pair v_pad() { + return {pad_before_after_sizes[(transposed_order[3] * 2) + 0], pad_before_after_sizes[(transposed_order[3] * 2) + 1]}; + } + std::pair u_pad() { + return {pad_before_after_sizes[(transposed_order[2] * 2) + 0], pad_before_after_sizes[(transposed_order[2] * 2) + 1]}; + } + std::pair f_pad() { + return {pad_before_after_sizes[(transposed_order[1] * 2) + 0], pad_before_after_sizes[(transposed_order[1] * 2) + 1]}; + } + std::pair b_pad() { + 
return {pad_before_after_sizes[(transposed_order[0] * 2) + 0], pad_before_after_sizes[(transposed_order[0] * 2) + 1]}; + } +}; + +struct sdpa_configuration { + int64_t head_size = -1; + int64_t heads_num = -1; + int64_t kv_heads_num = -1; + + // GQA configuration + int64_t group_size = -1; + int64_t broadcast_axis = -1; + + bool is_causal = false; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// sdpa_params +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct sdpa_params : public base_params { + sdpa_params() : base_params(KernelType::SDPA) {} + + std::vector input0_order; + std::vector input1_order; + std::vector input2_order; + std::vector output_order; + + sdpa_configuration conf; +}; + +struct sdpa_fuse_params : fuse_params { + sdpa_fuse_params() : fuse_params(KernelType::SDPA) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// SDPAKernelBase +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +class SDPAKernelBase : public KernelBaseOpenCL { +public: + using KernelBaseOpenCL::KernelBaseOpenCL; + virtual ~SDPAKernelBase() {} + + struct DispatchData : public CommonDispatchData {}; + +protected: + bool Validate(const Params& p) const override; + JitConstants GetJitConstants(const sdpa_params& params) const; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp new file mode 100644 index 00000000000000..581565874f7fbb --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp @@ -0,0 +1,258 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "sdpa_kernel_opt.h" +#include "kernel_selector_utils.h" +#include +#include + +namespace kernel_selector { + +constexpr size_t subgroup_size = 16; + +enum KernelsTypes { + SINGLE_TOKEN = 0, + MULTI_TOKENS, + FINALIZATION, + TOTAL_KERNELS_NUM +}; + +static size_t get_target_seq_len_block_size() { + const size_t block_size = 16; + return block_size; +} + + +static size_t get_seq_len_partition_size() { + const size_t seq_len = 256; + return seq_len; +} + +ParamsKey SDPAKernelOpt::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + + k.EnableDifferentTypes(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDynamicShapesSupport(); + + return k; +} + +bool SDPAKernelOpt::Validate(const Params& p) const { + if (!Parent::Validate(p)) + return false; + + const sdpa_params& params = static_cast(p); + + if (params.conf.head_size < 1 || params.conf.head_size % subgroup_size != 0) + return false; + + return true; +} + +JitConstants SDPAKernelOpt::GetJitConstants(const sdpa_params& params, size_t kernel_idx) const { + auto jit = SDPAKernelBase::GetJitConstants(params); + + const auto softmax_acc_dt = params.inputs[0].GetDType(); + jit.Merge(MakeTypeJitConstants(softmax_acc_dt, "SOFTMAX_ACCUMULATOR")); + + const auto& config = params.conf; + 
jit.AddConstant(MakeJitConstant("SUBGROUP_SIZE", subgroup_size)); + jit.AddConstant(MakeJitConstant("HEAD_SIZE", config.head_size)); + jit.AddConstant(MakeJitConstant("SEQ_LEN_PARTITION_SIZE", get_seq_len_partition_size())); + + auto target_seq_len_block_size = kernel_idx == KernelsTypes::SINGLE_TOKEN ? 1 : get_target_seq_len_block_size(); + jit.AddConstant(MakeJitConstant("TARGET_SEQ_LEN_BLOCK_SIZE", target_seq_len_block_size)); + + auto sdpa_stage = kernel_idx == KernelsTypes::FINALIZATION ? 1 : 0; + jit.AddConstant(MakeJitConstant("SDPA_STAGE_" + std::to_string(sdpa_stage), 1)); + + return jit; +} + +CommonDispatchData SDPAKernelOpt::SetDefault(const sdpa_params& params, size_t kernel_idx) const { + CommonDispatchData dispatch_data; + + const auto& query_input = params.inputs[0]; + + if (!query_input.is_dynamic()) { + TransposedDimensionAccessHelperBase dims_q(params.inputs[0], params.input0_order); + TransposedDimensionAccessHelperBase dims_k(params.inputs[1], params.input1_order); + TransposedDimensionAccessHelperBase output(params.outputs[0], params.output_order); + + const size_t batch_size = output.b_dim().v; + const size_t heads_num = output.f_dim().v; + const size_t source_seq_len = dims_k.y_dim().v; + const size_t target_seq_len = dims_q.y_dim().v; + const size_t head_size = static_cast(params.conf.head_size); + const size_t num_of_partitions = CeilDiv(source_seq_len, get_seq_len_partition_size()); + const size_t target_seq_len_block_size = kernel_idx == 1 ? get_target_seq_len_block_size() : 1; + + if (kernel_idx == KernelsTypes::SINGLE_TOKEN || kernel_idx == KernelsTypes::MULTI_TOKENS) { + dispatch_data.gws = { batch_size * heads_num, + CeilDiv(target_seq_len, target_seq_len_block_size), + head_size * num_of_partitions }; + dispatch_data.lws = { 1, 1, head_size }; + } else if (kernel_idx == 2) { + dispatch_data.gws = { batch_size * heads_num, + target_seq_len, + 16 }; + dispatch_data.lws = { 1, 1, 16 }; + } + } + + return dispatch_data; +} + +KernelsData SDPAKernelOpt::GetKernelsData(const Params& params) const { + if (!Validate(params)) { + return {}; + } + + // Implementation contains multiple kernels: + // kernel[0] - single token generation stage (2nd token) + // kernel[1] - multi tokens processing stage (1st token) + // kernel[2] - results aggregation + + const size_t kernels_num = KernelsTypes::TOTAL_KERNELS_NUM; + KernelData kd = KernelData::Default(params, kernels_num); + kd.needs_sub_kernels_sync = true; + + GetUpdateDispatchDataFunc(kd); + + const auto& prim_params = dynamic_cast(params); + for (size_t kernel_idx = 0; kernel_idx < kernels_num; kernel_idx++) { + auto dispatch_data = SetDefault(prim_params, kernel_idx); + auto kernel_name = kernel_idx == 0 ? kernelName + "_single_token" : + kernel_idx == 1 ? kernelName + "_multi_tokens" : kernelName + "_finalization"; + auto entry_point = GetEntryPoint(kernel_name, prim_params.layerID, params); + auto jit_constants = GetJitConstants(prim_params, kernel_idx); + auto jit = CreateJit(kernel_name, jit_constants, entry_point); + + auto& kernel = kd.kernels[kernel_idx]; + + auto inputs_num = + kernel_idx == KernelsTypes::FINALIZATION ? 
0 : static_cast(prim_params.inputs.size()); + + FillCLKernelData(kernel, + dispatch_data, + params.engineInfo, + kernelName, + jit, + entry_point, + {}, + false, + false, + inputs_num, + GetFusedPrimitiveInputsCount(params), + static_cast(prim_params.outputs.size()), + prim_params.is_shape_agnostic); + + const auto num_of_partitions = 1; + auto& output = prim_params.outputs[0]; + auto head_size = output.X().v; + + auto buf_dt_size = 4; + auto buf_elements_count = (num_of_partitions == 1) ? 1 : output.LogicalSize() / head_size * num_of_partitions; + auto buf_size = buf_elements_count * buf_dt_size; + + auto tmp_out_dt_size = 4; + auto tmp_out_elements_count = (num_of_partitions == 1) ? 1 : output.LogicalSize() * num_of_partitions; + auto tmp_out_size = tmp_out_elements_count * tmp_out_dt_size; + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 2}); + + kd.internalBufferSizes.clear(); + kd.internalBufferSizes.push_back(buf_size); + kd.internalBufferSizes.push_back(buf_size); + kd.internalBufferSizes.push_back(tmp_out_size); + kd.internalBufferDataType = prim_params.inputs[0].GetDType(); + + if (kernel_idx == KernelsTypes::FINALIZATION) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0}); + + ScalarDescriptor num_of_partitions_scalar; + num_of_partitions_scalar.t = ScalarDescriptor::Types::UINT32; + num_of_partitions_scalar.v.u32 = num_of_partitions; + + kernel.params.scalars.clear(); + kernel.params.scalars.push_back(num_of_partitions_scalar); + } + } + + return { kd }; +} + +void SDPAKernelOpt::GetUpdateDispatchDataFunc(KernelData& kd) const { + kd.update_dispatch_data_func = [this](const Params& params, KernelData& kernel_data) { + const auto& prim_params = static_cast(params); + + const size_t expected_kernels_num = KernelsTypes::TOTAL_KERNELS_NUM; + OPENVINO_ASSERT(kernel_data.kernels.size() == expected_kernels_num, + "[GPU] Invalid kernels size for update dispatch data func of SDPA kernel"); + + TransposedDimensionAccessHelperBase dims_q(prim_params.inputs[0], prim_params.input0_order); + TransposedDimensionAccessHelperBase dims_k(prim_params.inputs[1], prim_params.input1_order); + auto& output = prim_params.outputs[0]; + + auto target_seq_len = dims_q.y_dim().v; + auto head_size = dims_q.x_dim().v; + auto source_seq_len = dims_k.y_dim().v; + + auto num_of_partitions = CeilDiv(source_seq_len, get_seq_len_partition_size()); + + auto buf_dt_size = output.ElementSize(); + auto buf_elements_count = (num_of_partitions == 1) ? 1 : output.LogicalSize() / head_size * num_of_partitions; + auto buf_size = buf_elements_count * buf_dt_size; + + auto tmp_out_dt_size = output.ElementSize(); + auto tmp_out_elements_count = (num_of_partitions == 1) ? 
1 : output.LogicalSize() * num_of_partitions; + auto tmp_out_size = tmp_out_elements_count * tmp_out_dt_size; + + auto dispatch_data1 = SetDefault(prim_params, 0); + kernel_data.kernels[0].params.workGroups.global = dispatch_data1.gws; + kernel_data.kernels[0].params.workGroups.local = dispatch_data1.lws; + kernel_data.kernels[0].skip_execution = target_seq_len > 1; + + auto dispatch_data2 = SetDefault(prim_params, 1); + kernel_data.kernels[1].params.workGroups.global = dispatch_data2.gws; + kernel_data.kernels[1].params.workGroups.local = dispatch_data2.lws; + kernel_data.kernels[1].skip_execution = target_seq_len == 1; + + ScalarDescriptor num_of_partitions_scalar; + num_of_partitions_scalar.t = ScalarDescriptor::Types::UINT32; + num_of_partitions_scalar.v.u32 = static_cast(num_of_partitions); + + auto dispatch_data3 = SetDefault(prim_params, 2); + kernel_data.kernels[2].params.workGroups.global = dispatch_data3.gws; + kernel_data.kernels[2].params.workGroups.local = dispatch_data3.lws; + kernel_data.kernels[2].skip_execution = num_of_partitions == 1; + + kernel_data.kernels[2].params.scalars.clear(); + kernel_data.kernels[2].params.scalars.push_back(num_of_partitions_scalar); + + kernel_data.internalBufferSizes.clear(); + kernel_data.internalBufferSizes.push_back(buf_size); + kernel_data.internalBufferSizes.push_back(buf_size); + kernel_data.internalBufferSizes.push_back(tmp_out_size); + kernel_data.internalBufferDataType = prim_params.inputs[0].GetDType(); + }; +} + +KernelsPriority SDPAKernelOpt::GetKernelsPriority(const Params& /*params*/) const { + return FORCE_PRIORITY_1; +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.h new file mode 100644 index 00000000000000..8d7279f5546112 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.h @@ -0,0 +1,29 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "sdpa_kernel_base.h" + +namespace kernel_selector { +class SDPAKernelOpt : public SDPAKernelBase { +public: + using Parent = SDPAKernelBase; + SDPAKernelOpt() : SDPAKernelBase("sdpa_opt") {} + virtual ~SDPAKernelOpt() {} + + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + ParamsKey GetSupportedKey() const override; + +protected: + bool Validate(const Params& p) const override; + void GetUpdateDispatchDataFunc(KernelData& kd) const override; + CommonDispatchData SetDefault(const sdpa_params& params, size_t kernel_idx) const; + JitConstants GetJitConstants(const sdpa_params& params, size_t kernel_idx) const; + std::vector GetSupportedFusedOps() const override { + return {}; + } +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp new file mode 100644 index 00000000000000..a80f3c31dfc8f3 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp @@ -0,0 +1,110 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "sdpa_kernel_ref.h" +#include "kernel_selector_utils.h" +#include +#include + +namespace kernel_selector { + +ParamsKey SDPAKernelRef::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + 
k.EnableInputDataType(Datatype::F32); + + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + + k.EnableDifferentTypes(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDynamicShapesSupport(); + + return k; +} + +JitConstants SDPAKernelRef::GetJitConstants(const sdpa_params& params) const { + auto jit = SDPAKernelBase::GetJitConstants(params); + + auto acc_dt = params.inputs[0].GetDType(); + jit.Merge(MakeTypeJitConstants(acc_dt, "ACCUMULATOR")); + + TransposedDimensionAccessHelperJit dims_q(params.inputs[0], params.input0_order); + jit.AddConstant(MakeJitConstant("HEAD_SIZE", dims_q.x())); + + return jit; +} + +CommonDispatchData SDPAKernelRef::SetDefault(const sdpa_params& params) const { + CommonDispatchData dispatchData; + + const auto& output = params.outputs[0]; + dispatchData.gws = { output.Batch().v * output.Feature().v, output.Y().v, output.X().v }; + dispatchData.lws = { 1, 1, output.X().v }; + + return dispatchData; +} + +KernelsData SDPAKernelRef::GetKernelsData(const Params& params) const { + KernelData kd = KernelData::Default<sdpa_params>(params); + const auto& prim_params = dynamic_cast<const sdpa_params&>(params); + + if (!Validate(params)) { + return {}; + } + + auto dispatchData = SetDefault(prim_params); + auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, params); + auto cldnn_jit = GetJitConstants(prim_params); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); + + auto& kernel = kd.kernels[0]; + + GetUpdateDispatchDataFunc(kd); + + FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point, + "", false, false, static_cast<int>(prim_params.inputs.size()), + GetFusedPrimitiveInputsCount(params), 1, prim_params.is_shape_agnostic); + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + + kd.internalBufferSizes.clear(); + kd.internalBufferSizes.push_back(prim_params.inputs[0].ElementSize()); + kd.internalBufferDataType = prim_params.inputs[0].GetDType(); + + return { kd }; +} + +void SDPAKernelRef::GetUpdateDispatchDataFunc(KernelData& kd) const { + kd.update_dispatch_data_func = [this](const Params& params, KernelData& kernel_data) { + const auto& prim_params = static_cast<const sdpa_params&>(params); + auto dispatchData = SetDefault(prim_params); + OPENVINO_ASSERT(kernel_data.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func"); + kernel_data.kernels[0].params.workGroups.global = dispatchData.gws; + kernel_data.kernels[0].params.workGroups.local = dispatchData.lws; + kernel_data.kernels[0].skip_execution = KernelData::SkipKernelExecution(prim_params); + + auto& in_q = prim_params.inputs[0]; + auto& in_k = prim_params.inputs[1]; + TransposedDimensionAccessHelperBase dims_q(in_q, prim_params.input0_order); + TransposedDimensionAccessHelperBase dims_k(in_k, prim_params.input1_order); + + auto elem_size = in_q.ElementSize(); + auto batch_size = in_q.LogicalSize() / dims_q.x_dim().v / dims_q.y_dim().v; + kernel_data.internalBufferSizes.clear(); + kernel_data.internalBufferSizes.push_back(batch_size * dims_q.y_dim().v * dims_k.y_dim().v * elem_size); + + kernel_data.internalBufferDataType = in_q.GetDType(); + }; +} + +KernelsPriority SDPAKernelRef::GetKernelsPriority(const Params& /*params*/) const { + return DONT_USE_IF_HAVE_SOMETHING_ELSE; +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.h
b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.h new file mode 100644 index 00000000000000..c570f32cc1e94e --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.h @@ -0,0 +1,28 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "sdpa_kernel_base.h" + +namespace kernel_selector { +class SDPAKernelRef : public SDPAKernelBase { +public: + using Parent = SDPAKernelBase; + SDPAKernelRef() : SDPAKernelBase("sdpa_ref") {} + virtual ~SDPAKernelRef() {} + + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + ParamsKey GetSupportedKey() const override; + +protected: + void GetUpdateDispatchDataFunc(KernelData& kd) const override; + CommonDispatchData SetDefault(const sdpa_params& params) const; + JitConstants GetJitConstants(const sdpa_params& params) const; + std::vector<FusedOpType> GetSupportedFusedOps() const override { + return {}; + } +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp new file mode 100644 index 00000000000000..b58f04f23e2643 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp @@ -0,0 +1,19 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "sdpa_kernel_selector.h" +#include "sdpa_kernel_ref.h" +#include "sdpa_kernel_opt.h" + +namespace kernel_selector { + +sdpa_kernel_selector::sdpa_kernel_selector() { + Attach<SDPAKernelRef>(); + Attach<SDPAKernelOpt>(); +} + +KernelsData sdpa_kernel_selector::GetBestKernels(const Params& params) const { + return GetNaiveBestKernel(params, KernelType::SDPA); +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.h new file mode 100644 index 00000000000000..e4a5f245bfe18b --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.h @@ -0,0 +1,23 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector { +class sdpa_kernel_selector : public kernel_selector_base { +public: + static sdpa_kernel_selector& Instance() { + static sdpa_kernel_selector instance_; + return instance_; + } + + sdpa_kernel_selector(); + + virtual ~sdpa_kernel_selector() {} + + KernelsData GetBestKernels(const Params& params) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/slice/slice_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/slice/slice_kernel_ref.cpp index ee6f39c3c3c71e..34279dd7de148c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/slice/slice_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/slice/slice_kernel_ref.cpp @@ -122,7 +122,7 @@ JitConstants SliceKernelRef::GetJitConstants(const slice_params& params) const { // Define axes size as constant: if (params.compile_time_axes.empty()) { - kernel_selector::DimensionAccessHelper dims(params.inputs.back()); + kernel_selector::DimensionAccessHelperJit dims(params.inputs.back()); jit.AddConstant(MakeJitConstant(JIT_AXES_BUFF_SIZE_NAME, toVectorMulString({dims.b(), dims.f(),
dims.x(), dims.y(), dims.z()}))); } else { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.cpp index 335c2bc1017303..338ed8d3fb1077 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.cpp @@ -115,7 +115,7 @@ JitConstants SoftmaxKernel_bf::GetJitConstants(const softmax_params& params, Dis if (params.has_dynamic_tensors()) { const auto& input = params.inputs[0]; - DimensionAccessHelper dims(input); + DimensionAccessHelperJit dims(input); auto softmax_dim_y_bfyx = (params.dim == SoftmaxDim::Y && input.GetLayout() == DataLayout::bfyx); auto softmax_dim_x_bfyx = (params.dim == SoftmaxDim::X && input.GetLayout() == DataLayout::bfyx); const std::string lws_0 = "get_local_size(0)"; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/unique/unique_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/unique/unique_kernel_ref.cpp index 5aafdd309ae6d0..5d20503919241b 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/unique/unique_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/unique/unique_kernel_ref.cpp @@ -216,7 +216,7 @@ JitConstants UniqueCountKernelRef::GetJitConstants(const unique_count_params& ke } if (input.is_dynamic()) { - DimensionAccessHelper dims(input); + DimensionAccessHelperJit dims(input); const std::string total_data_size = toVectorMulString({dims.x(), dims.y(), dims.z(), dims.w(), dims.f(), dims.b()}); jit_constants.AddConstant(MakeJitConstant("TOTAL_DATA_SIZE", total_data_size)); @@ -326,7 +326,7 @@ JitConstants UniqueGatherKernelRef::GetJitConstants(const unique_gather_params& } if (input.is_dynamic()) { - DimensionAccessHelper dims(input); + DimensionAccessHelperJit dims(input); const std::string total_data_size = toVectorMulString({dims.x(), dims.y(), dims.z(), dims.w(), dims.f(), dims.b()}); jit_constants.AddConstant(MakeJitConstant("TOTAL_DATA_SIZE", total_data_size)); diff --git a/src/plugins/intel_gpu/src/plugin/ops/roi_align_rotated.cpp b/src/plugins/intel_gpu/src/plugin/ops/roi_align_rotated.cpp index 306bab54721ddd..fd50b4dcc76425 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/roi_align_rotated.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/roi_align_rotated.cpp @@ -14,7 +14,7 @@ namespace intel_gpu { namespace { -void CreateROIAlignRotatedOp(ProgramBuilder& p, const std::shared_ptr& op) { +void CreateROIAlignRotatedOp(ProgramBuilder& p, const std::shared_ptr& op) { validate_inputs_count(op, {3}); auto roi_align_prim = cldnn::roi_align(layer_type_name_ID(op), p.GetInputInfo(op), @@ -31,7 +31,7 @@ void CreateROIAlignRotatedOp(ProgramBuilder& p, const std::shared_ptr& op) { + validate_inputs_count(op, {3, 4, 5}); + auto inputs = p.GetInputInfo(op); + auto layerName = layer_type_name_ID(op); + + bool is_causal = op->get_causal(); + auto sdpa_prim = cldnn::scaled_dot_product_attention(layerName, + inputs, + is_causal); + + p.add_primitive(*op, sdpa_prim); +} + +static void CreateSDPAOp(ProgramBuilder& p, const std::shared_ptr& op) { + validate_inputs_count(op, {3, 4, 5}); + auto inputs = p.GetInputInfo(op); + auto layerName = layer_type_name_ID(op); + + bool is_causal = op->get_causal(); + auto sdpa_prim = cldnn::scaled_dot_product_attention(layerName, + inputs, + is_causal, + op->get_input0_transpose_order(), + op->get_input1_transpose_order(), + 
op->get_input2_transpose_order(), + op->get_output_transpose_order()); + + p.add_primitive(*op, sdpa_prim); +} + +REGISTER_FACTORY_IMPL(internal, SDPA); +REGISTER_FACTORY_IMPL(v13, ScaledDotProductAttention); + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp new file mode 100644 index 00000000000000..67e927abb43f97 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp @@ -0,0 +1,171 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "intel_gpu/op/sdpa.hpp" +#include "intel_gpu/plugin/common_utils.hpp" +#include "scaled_dot_product_attention_shape_inference.hpp" +#include "openvino/core/partial_shape.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/broadcast.hpp" +#include "openvino/op/reshape.hpp" + +namespace ov { +namespace intel_gpu { +namespace op { + +SDPA::SDPA(const ov::Output& Q, + const ov::Output& K, + const ov::Output& V, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const bool is_causal, + const ov::element::Type output_type) + : m_order_q(order_q) + , m_order_k(order_k) + , m_order_v(order_v) + , m_order_out(order_out) + , m_is_causal(is_causal) + , m_output_type(output_type) { + set_arguments({Q, K, V}); + validate_and_infer_types(); +} + +SDPA::SDPA(const ov::Output& Q, + const ov::Output& K, + const ov::Output& V, + const ov::Output& attn_mask, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const bool is_causal, + const ov::element::Type output_type) + : m_order_q(order_q) + , m_order_k(order_k) + , m_order_v(order_v) + , m_order_out(order_out) + , m_is_causal(is_causal) + , m_output_type(output_type) { + set_arguments({Q, K, V, attn_mask}); + validate_and_infer_types(); +} + +SDPA::SDPA(const ov::Output& Q, + const ov::Output& K, + const ov::Output& V, + const ov::Output& attn_mask, + const ov::Output& scale, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const bool is_causal, + const ov::element::Type output_type) + : m_order_q(order_q) + , m_order_k(order_k) + , m_order_v(order_v) + , m_order_out(order_out) + , m_is_causal(is_causal) + , m_output_type(output_type) { + set_arguments({Q, K, V, attn_mask, scale}); + validate_and_infer_types(); +} + +std::shared_ptr SDPA::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + return std::make_shared(new_args.at(0), new_args.at(1), new_args.at(2), m_order_q, m_order_k, m_order_v, m_order_out, m_is_causal, m_output_type); +} + +void SDPA::validate_and_infer_types() { + const auto input_size = get_input_size(); + NODE_VALIDATION_CHECK(this, + input_size == 3 || input_size == 4 || input_size == 5, + "Number of inputs is incorrect. Current value is: ", + input_size, + ", expected 3, 4 or 5."); + + std::vector input_shapes; + for (size_t i = 0; i < input_size; i++) { + input_shapes.push_back(get_input_partial_shape(i)); + } + + auto out_shapes = shape_infer(this, + input_shapes, + m_order_q, + m_order_k, + m_order_v, + m_order_out); + + auto output_type = m_output_type == ov::element::undefined ? 
get_input_element_type(0) : m_output_type; + set_output_type(0, output_type, out_shapes[0]); +} + +bool SDPA::visit_attributes(ov::AttributeVisitor &visitor) { + visitor.on_attribute("order_q", m_order_q); + visitor.on_attribute("order_k", m_order_k); + visitor.on_attribute("order_v", m_order_v); + visitor.on_attribute("order_out", m_order_out); + visitor.on_attribute("output_type", m_output_type); + return true; +} + +std::vector shape_infer(const SDPA* op, + std::vector input_shapes, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out) { + auto shape_q = input_shapes[0]; + auto shape_k = input_shapes[1]; + auto shape_v = input_shapes[2]; + + // transposed shape + auto transpose_pshape = [](const ov::PartialShape pshape, const std::vector& order) { + auto transposed_pshape = ov::PartialShape::dynamic(pshape.rank()); + for (size_t i = 0; i < order.size(); i++) { + transposed_pshape[i] = pshape[order[i]]; + } + + return transposed_pshape; + }; + + auto shape_q_t = (order_q.size() > 1) ? transpose_pshape(shape_q, order_q) : shape_q; + auto shape_k_t = (order_k.size() > 1) ? transpose_pshape(shape_k, order_k) : shape_k; + auto shape_v_t = (order_v.size() > 1) ? transpose_pshape(shape_v, order_v) : shape_v; + + const auto is_broadcastable = shape_k_t.rank().is_static() && + shape_v_t.rank().is_static() && + ((shape_q_t.size() == shape_k_t.size()) && (shape_q_t.size() == shape_v_t.size())); + if (is_broadcastable) { + size_t max_rank = shape_q_t.size(); + for (size_t i = 0; i < max_rank; ++i) { + if (shape_q_t[i].is_static() && shape_k_t[i].is_static() && shape_v_t[i].is_static()) { + auto broadcasted_dim = shape_q_t[i].get_length(); + shape_k_t[i] = broadcasted_dim; + shape_v_t[i] = broadcasted_dim; + } + } + } + + std::vector transposed_input_shapes{ shape_q_t, shape_k_t, shape_v_t }; + for (size_t i = 3; i < transposed_input_shapes.size(); i++) { + transposed_input_shapes.push_back(input_shapes[i]); + } + + OPENVINO_ASSERT(op != nullptr, "op should not be nullptr for shape_infer."); + auto out_shapes = ov::op::v13::shape_infer(dynamic_cast(op), transposed_input_shapes); + + if (order_out.size() > 0) { + return { transpose_pshape(out_shapes[0], order_out) }; + } else { + return { out_shapes[0] }; + } +} + +} // namespace op +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/transpose_matmul_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.cpp similarity index 56% rename from src/plugins/intel_gpu/src/plugin/transformations/transpose_matmul_fusion.cpp rename to src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.cpp index e57a7978a5e7bf..614a42845ec521 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/transpose_matmul_fusion.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.cpp @@ -3,14 +3,16 @@ // #include "intel_gpu/op/gemm.hpp" +#include "intel_gpu/op/sdpa.hpp" #include "openvino/core/node_vector.hpp" #include "openvino/core/partial_shape.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/constant.hpp" #include "openvino/pass/pattern/op/label.hpp" #include "openvino/pass/pattern/op/pattern.hpp" -#include "transpose_matmul_fusion.hpp" +#include "transpose_fusion.hpp" #include "openvino/op/matmul.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" #include "openvino/op/convert.hpp" #include "openvino/op/transpose.hpp" #include "openvino/core/rt_info.hpp" @@ 
-25,23 +27,133 @@ using ov::pass::pattern::op::Or; namespace ov { namespace intel_gpu { -class TransposeMatMulMatcher : public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("TransposeMatMulMatcher", "0"); - TransposeMatMulMatcher(); -}; - -class TransposeMatMulTransposeMatcher : public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("TransposeMatMulTransposeMatcher", "0"); - TransposeMatMulTransposeMatcher(); -}; - -TransposeMatMulFusion::TransposeMatMulFusion() { +TransposeFusion::TransposeFusion() { add_matcher(); add_matcher(); + add_matcher(); } +TransposeSDPAMatcher::TransposeSDPAMatcher() { + auto is_fp_type = [](const ov::Output& output) -> bool { + switch (output.get_element_type()) { + case ov::element::f16: + case ov::element::f32: return true; + default: return false; + } + }; + auto not_transpose = [is_fp_type](const ov::Output& output) -> bool { + return std::dynamic_pointer_cast(output.get_node_shared_ptr()) == nullptr + && is_fp_type(output); + }; + auto is_dynamic = [](const ov::Output& output) -> bool { + bool is_dynamic = output.get_node_shared_ptr()->get_output_partial_shape(0).is_dynamic(); + size_t num_inputs = output.get_node_shared_ptr()->get_input_size(); + for (size_t idx = 0; idx < num_inputs; idx++) { + is_dynamic |= output.get_node_shared_ptr()->get_input_partial_shape(idx).is_dynamic(); + } + return is_dynamic; + }; + + auto input_q_m = any_input(not_transpose); + auto input_k_m = any_input(not_transpose); + auto input_v_m = any_input(not_transpose); + auto input_attn_mask = any_input(not_transpose); + auto input_scale = any_input(not_transpose); + auto transpose_q_order_m = wrap_type(consumers_count(1)); + auto transpose_k_order_m = wrap_type(consumers_count(1)); + auto transpose_v_order_m = wrap_type(consumers_count(1)); + auto transpose_q_m = wrap_type({input_q_m, transpose_q_order_m}, is_fp_type); + auto transpose_k_m = wrap_type({input_k_m, transpose_k_order_m}, is_fp_type); + auto transpose_v_m = wrap_type({input_v_m, transpose_v_order_m}, is_fp_type); + + auto sdpa_in_q = std::make_shared(OutputVector{input_q_m, transpose_q_m}); + auto sdpa_in_k = std::make_shared(OutputVector{input_k_m, transpose_k_m}); + auto sdpa_in_v = std::make_shared(OutputVector{input_v_m, transpose_v_m}); + + auto sdpa_without_attn_mask_m = wrap_type({ sdpa_in_q, sdpa_in_k, sdpa_in_v }, is_dynamic); + auto sdpa_with_attn_mask_m = wrap_type({ sdpa_in_q, sdpa_in_k, sdpa_in_v, input_attn_mask }, is_dynamic); + auto sdpa_with_attn_mask_and_scale_m = + wrap_type({ sdpa_in_q, sdpa_in_k, sdpa_in_v, input_attn_mask, input_scale }, is_dynamic); + + auto sdpa_m = std::make_shared(OutputVector{sdpa_without_attn_mask_m, sdpa_with_attn_mask_m, sdpa_with_attn_mask_and_scale_m}); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + std::shared_ptr sdpa; + if (pattern_map.find(sdpa_without_attn_mask_m) != pattern_map.end()) { + sdpa = std::dynamic_pointer_cast(pattern_map.at(sdpa_without_attn_mask_m).get_node_shared_ptr()); + } else if (pattern_map.find(sdpa_with_attn_mask_m) != pattern_map.end()) { + sdpa = std::dynamic_pointer_cast(pattern_map.at(sdpa_with_attn_mask_m).get_node_shared_ptr()); + } else if (pattern_map.find(sdpa_with_attn_mask_and_scale_m) != pattern_map.end()) { + sdpa = std::dynamic_pointer_cast(pattern_map.at(sdpa_with_attn_mask_and_scale_m).get_node_shared_ptr()); + } + + if (!sdpa || transformation_callback(sdpa)) { + return false; + } + + auto order_q = 
op::SDPA::default_order(sdpa->get_input_partial_shape(0).size()); + auto order_k = op::SDPA::default_order(sdpa->get_input_partial_shape(1).size()); + auto order_v = op::SDPA::default_order(sdpa->get_input_partial_shape(2).size()); + auto order_output = op::SDPA::default_order(sdpa->get_output_partial_shape(0).size()); + size_t input_q_output_idx = sdpa->get_input_source_output(0).get_index(); + size_t input_k_output_idx = sdpa->get_input_source_output(1).get_index(); + size_t input_v_output_idx = sdpa->get_input_source_output(2).get_index(); + + if (pattern_map.count(transpose_q_m) > 0) { + auto tranpose_a_order = std::dynamic_pointer_cast(pattern_map.at(transpose_q_order_m).get_node_shared_ptr()); + order_q = tranpose_a_order->cast_vector(); + if (order_q.back() != static_cast(order_q.size() - 1)) // Allow any transposes without head_size dim position change + return false; + + auto tranpose_a = std::dynamic_pointer_cast(pattern_map.at(transpose_q_m).get_node_shared_ptr()); + input_q_output_idx = tranpose_a->get_input_source_output(0).get_index(); + } + if (pattern_map.count(transpose_k_m) > 0) { + auto tranpose_b_order = std::dynamic_pointer_cast(pattern_map.at(transpose_k_order_m).get_node_shared_ptr()); + order_k = tranpose_b_order->cast_vector(); + if (order_k.back() != static_cast(order_k.size() - 1)) // Allow any transposes without head_size dim position change + return false; + + auto tranpose_b = std::dynamic_pointer_cast(pattern_map.at(transpose_k_m).get_node_shared_ptr()); + input_k_output_idx = tranpose_b->get_input_source_output(0).get_index(); + } + if (pattern_map.count(transpose_v_m) > 0) { + auto tranpose_c_order = std::dynamic_pointer_cast(pattern_map.at(transpose_v_order_m).get_node_shared_ptr()); + order_v = tranpose_c_order->cast_vector(); + if (order_v.back() != static_cast(order_v.size() - 1)) // Allow any transposes without head_size dim position change + return false; + + auto tranpose_c = std::dynamic_pointer_cast(pattern_map.at(transpose_k_m).get_node_shared_ptr()); + input_v_output_idx = tranpose_c->get_input_source_output(0).get_index(); + } + + auto input_q = ov::Output(pattern_map.at(input_q_m).get_node_shared_ptr(), input_q_output_idx); + auto input_k = ov::Output(pattern_map.at(input_k_m).get_node_shared_ptr(), input_k_output_idx); + auto input_v = ov::Output(pattern_map.at(input_v_m).get_node_shared_ptr(), input_v_output_idx); + + std::shared_ptr sdpa_new; + if (pattern_map.find(sdpa_without_attn_mask_m) != pattern_map.end()) { + sdpa_new = std::make_shared(input_q, input_k, input_v, order_q, order_k, order_v, order_output, sdpa->get_causal()); + } else if (pattern_map.find(sdpa_with_attn_mask_m) != pattern_map.end()) { + auto attn_mask = sdpa->get_input_source_output(3); + sdpa_new = std::make_shared(input_q, input_k, input_v, attn_mask, order_q, order_k, order_v, order_output, sdpa->get_causal()); + } else if (pattern_map.find(sdpa_with_attn_mask_and_scale_m) != pattern_map.end()) { + auto attn_mask = sdpa->get_input_source_output(3); + auto scale = sdpa->get_input_source_output(4); + sdpa_new = std::make_shared(input_q, input_k, input_v, attn_mask, scale, order_q, order_k, order_v, order_output, sdpa->get_causal()); + } + + sdpa_new->set_friendly_name(sdpa->get_friendly_name()); + ov::copy_runtime_info(m.get_matched_nodes(), sdpa_new); + ov::replace_node(sdpa, sdpa_new); + return true; + }; + + auto m = std::make_shared(sdpa_m, "TransposeSDPAMatcher"); + this->register_matcher(m, callback); +} TransposeMatMulMatcher::TransposeMatMulMatcher() { auto 
is_fp_type = [](const ov::Output& output) -> bool { diff --git a/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.hpp b/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.hpp new file mode 100644 index 00000000000000..a9b3ebe05317f3 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.hpp @@ -0,0 +1,37 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + +class TransposeFusion: public ov::pass::GraphRewrite { +public: + OPENVINO_RTTI("TransposeFusion", "0"); + TransposeFusion(); +}; + +class TransposeMatMulMatcher : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("TransposeMatMulMatcher", "0"); + TransposeMatMulMatcher(); +}; + +class TransposeMatMulTransposeMatcher : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("TransposeMatMulTransposeMatcher", "0"); + TransposeMatMulTransposeMatcher(); +}; + +class TransposeSDPAMatcher : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("TransposeSDPAMatcher", "0"); + TransposeSDPAMatcher(); +}; + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/transpose_matmul_fusion.hpp b/src/plugins/intel_gpu/src/plugin/transformations/transpose_matmul_fusion.hpp deleted file mode 100644 index b24d76059ada11..00000000000000 --- a/src/plugins/intel_gpu/src/plugin/transformations/transpose_matmul_fusion.hpp +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "openvino/pass/graph_rewrite.hpp" - -namespace ov { -namespace intel_gpu { - -class TransposeMatMulFusion: public ov::pass::GraphRewrite { -public: - OPENVINO_RTTI("TransposeMatMulFusion", "0"); - TransposeMatMulFusion(); -}; - -} // namespace intel_gpu -} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.cpp new file mode 100644 index 00000000000000..3fdb3794585106 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.cpp @@ -0,0 +1,134 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "unsqueeze_broadcast_reshape_sdpa_fusion.hpp" + +#include "intel_gpu/op/sdpa.hpp" +#include "intel_gpu/op/kv_cache.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/op/broadcast.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/unsqueeze.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "transformations/utils/utils.hpp" + +namespace ov { +namespace intel_gpu { +using ov::pass::pattern::op::Or; + +UnsqueezeBroadcastReshapeSDPAFusion::UnsqueezeBroadcastReshapeSDPAFusion() { + using namespace ov::pass::pattern; + + auto not_reshape = [](const ov::Output& output) -> bool { + return std::dynamic_pointer_cast(output.get_node_shared_ptr()) == nullptr; + }; + + auto unsqueeze_predicate = [](const ov::Output& output) -> bool { + return rank_equals(5)(output) && consumers_count(1); + }; + + auto broadcast_predicate = [](const ov::Output& output) -> bool { + const auto broadcast = ov::as_type_ptr(output.get_node_shared_ptr()); + if (!broadcast || 
broadcast->get_broadcast_spec().m_type != ov::op::BroadcastType::BIDIRECTIONAL) + return false; + return rank_equals(5)(output) && consumers_count(1); + }; + + auto reshape_predicate = [](const ov::Output& output) -> bool { + return rank_equals(4)(output) && consumers_count(1); + }; + + auto input_a_m = any_input(not_reshape); + auto input_attn_mask = any_input(); + auto input_scale = any_input(); + auto input_b_m = wrap_type({any_input(), any_input()}); + auto input_c_m = wrap_type({any_input(), any_input()}); + auto axes_const_b_m = wrap_type(); + auto axes_const_c_m = wrap_type(); + auto unsqueeze_b_m = wrap_type({input_b_m, axes_const_b_m}, unsqueeze_predicate); + auto unsqueeze_c_m = wrap_type({input_c_m, axes_const_c_m}, unsqueeze_predicate); + auto broadcast_b_m = wrap_type({unsqueeze_b_m, any_input()}, broadcast_predicate); + auto broadcast_c_m = wrap_type({unsqueeze_c_m, any_input()}, broadcast_predicate); + auto reshape_b_m = wrap_type({broadcast_b_m, any_input()}, reshape_predicate); + auto reshape_c_m = wrap_type({broadcast_c_m, any_input()}, reshape_predicate); + + auto sdpa_without_attn_mask_m = wrap_type({ input_a_m, reshape_b_m, reshape_c_m }); + auto sdpa_with_attn_mask_m = wrap_type({ input_a_m, reshape_b_m, reshape_c_m, input_attn_mask }); + auto sdpa_with_attn_mask_and_scale_m = wrap_type({ input_a_m, reshape_b_m, reshape_c_m, input_attn_mask, input_scale }); + + auto sdpa_m = std::make_shared(OutputVector{sdpa_without_attn_mask_m, sdpa_with_attn_mask_m, sdpa_with_attn_mask_and_scale_m}); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { + if (transformation_callback(m.get_match_root())) { + return false; + } + const auto& pattern_map = m.get_pattern_value_map(); + + auto valid_broadcast_target_shape = [](const std::vector& target_shape) { + return std::count_if(target_shape.begin(), target_shape.end(), [](int32_t s) { return s != 1; }) == 1; + }; + auto broadcast_b = std::dynamic_pointer_cast(pattern_map.at(broadcast_b_m).get_node_shared_ptr()); + auto broadcast_c = std::dynamic_pointer_cast(pattern_map.at(broadcast_c_m).get_node_shared_ptr()); + + std::vector target_shape_val_b; + auto target_shape_constant_b = std::dynamic_pointer_cast(broadcast_c->get_input_node_shared_ptr(1)); + if (target_shape_constant_b) { + target_shape_val_b = target_shape_constant_b->cast_vector(); + if (!valid_broadcast_target_shape(target_shape_val_b)) { + return false; + } + } + + std::vector target_shape_val_c; + auto target_shape_constant_c = std::dynamic_pointer_cast(broadcast_b->get_input_node_shared_ptr(1)); + if (target_shape_constant_c) { + target_shape_val_c = target_shape_constant_c->cast_vector(); + if (!valid_broadcast_target_shape(target_shape_val_c)) { + return false; + } + } + + // Expect the same broadcast rules for key and value inputs + if (target_shape_val_b != target_shape_val_c) { + return false; + } + + auto input_a = pattern_map.at(input_a_m).get_node_shared_ptr(); + auto input_b = pattern_map.at(input_b_m).get_node_shared_ptr(); + auto input_c = pattern_map.at(input_c_m).get_node_shared_ptr(); + + auto sdpa = std::dynamic_pointer_cast(m.get_match_root()); + auto order_a = sdpa->get_input0_transpose_order(); + auto order_b = sdpa->get_input1_transpose_order(); + auto order_c = sdpa->get_input2_transpose_order(); + auto order_d = sdpa->get_output_transpose_order(); + + std::shared_ptr sdpa_new; + if (pattern_map.find(sdpa_without_attn_mask_m) != pattern_map.end()) { + sdpa_new = std::make_shared(input_a, input_b, 
input_c, order_a, order_b, order_c, order_d, sdpa->get_causal()); + } else if (pattern_map.find(sdpa_with_attn_mask_m) != pattern_map.end()) { + auto attn_mask = sdpa->get_input_source_output(3); + sdpa_new = std::make_shared(input_a, input_b, input_c, attn_mask, order_a, order_b, order_c, order_d, sdpa->get_causal()); + } else if (pattern_map.find(sdpa_with_attn_mask_and_scale_m) != pattern_map.end()) { + auto attn_mask = sdpa->get_input_source_output(3); + auto scale = sdpa->get_input_source_output(4); + sdpa_new = std::make_shared(input_a, input_b, input_c, attn_mask, scale, order_a, order_b, order_c, order_d, sdpa->get_causal()); + } + + sdpa_new->set_friendly_name(sdpa->get_friendly_name()); + ov::copy_runtime_info(m.get_matched_nodes(), sdpa_new); + ov::replace_node(sdpa, sdpa_new); + + return true; + }; + + auto m = std::make_shared(sdpa_m, "UnsqueezeBroadcastReshapeSDPAFusion"); + this->register_matcher(m, callback); +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.hpp b/src/plugins/intel_gpu/src/plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.hpp new file mode 100644 index 00000000000000..ede3ac16fb51b5 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.hpp @@ -0,0 +1,19 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + +class UnsqueezeBroadcastReshapeSDPAFusion : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("UnsqueezeBroadcastReshapeSDPAFusion", "0"); + UnsqueezeBroadcastReshapeSDPAFusion(); +}; + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 0c690dfe7d6df1..5d8db18151cd4e 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -61,11 +61,12 @@ #include "plugin/transformations/bcast_and_pad_zp_buffers.hpp" #include "transformations/common_optimizations/rms_fusion.hpp" #include "plugin/transformations/swiglu_fusion.hpp" -#include "plugin/transformations/transpose_matmul_fusion.hpp" +#include "plugin/transformations/transpose_fusion.hpp" #include "plugin/transformations/indirect_kv_cache.hpp" #include "plugin/transformations/convert_convolution.hpp" #include "plugin/transformations/unsqueeze_broadcast_reshape_matmul_fusion.hpp" #include "transformations/common_optimizations/rms_fusion.hpp" +#include "plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.hpp" #include "transformations/common_optimizations/broadcast_elementwise_fusion.hpp" #include "transformations/common_optimizations/broadcast_transition.hpp" #include "transformations/common_optimizations/common_optimizations.hpp" @@ -134,6 +135,7 @@ #include "transformations/op_conversions/simplify_ctc_greedy_decoder_seq_len.hpp" #include "transformations/op_conversions/softmax_decomposition.hpp" #include "transformations/op_conversions/softplus_decomposition.hpp" +#include "transformations/op_conversions/scaled_dot_product_attention_decomposition.hpp" #include "transformations/opset_conversions/convert_opset2_to_opset1.hpp" #include "transformations/opset_conversions/convert_opset3_to_opset2.hpp" #include "transformations/resolve_names_collisions.hpp" @@ -303,6 
+305,50 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); + pass_config->set_callback([&](const std::shared_ptr node){ + if (!config.get_property(ov::intel_gpu::hint::enable_sdpa_optimization)) + return false; + + auto sdpa = std::dynamic_pointer_cast(node); + const auto& query_ps = sdpa->get_input_partial_shape(0); + const auto& key_ps = sdpa->get_input_partial_shape(1); + const auto& value_ps = sdpa->get_input_partial_shape(2); + + // Known limitations: + // - SDPA impl could be slower in non-LLM scenarios than decomposed version + if (func->get_variables().size() == 0) + return false; + + // - The data type of SDPA should be fp16 + if (sdpa->get_output_element_type(0) != ov::element::f16) + return false; + + // - The number of dimensions for each input is expected to be 4 + if (query_ps.size() != 4 || key_ps.size() != 4 || value_ps.size() != 4) { + return false; + } + + // - The head size of all Q, K, and V inputs should be the same static value + if (query_ps[query_ps.size() - 1].is_dynamic() || key_ps[key_ps.size() - 1].is_dynamic() || value_ps[query_ps.size() - 1].is_dynamic()) { + return false; + } + + if (query_ps[query_ps.size() - 1].get_length() != key_ps[key_ps.size() - 1].get_length() || + query_ps[query_ps.size() - 1].get_length() != value_ps[query_ps.size() - 1].get_length()) { + return false; + } + + // - The head size should be divisible by 16 + const auto optimal_subgroup_size = 16; + if (query_ps[query_ps.size() - 1].is_dynamic() || + query_ps[query_ps.size() - 1].get_length() > 256 || + query_ps[query_ps.size() - 1].get_length() % optimal_subgroup_size != 0) { + return false; + } + + return true; + }); + manager.register_pass(); manager.register_pass(); @@ -749,10 +795,17 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); manager.register_pass(); manager.register_pass(); + manager.register_pass(); + if (device_info.supports_immad) { + manager.get_pass_config()->disable(); + manager.get_pass_config()->disable(); + } + if (!device_info.supports_immad) { - manager.register_pass(); manager.register_pass(); } + manager.register_pass(); + manager.register_pass(); manager.register_pass(); manager.register_pass(); diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index 8a57759bff9413..66b8d3e70cab1f 100644 --- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -50,6 +50,7 @@ void ExecutionConfig::set_default() { std::make_tuple(ov::intel_gpu::hint::host_task_priority, ov::hint::Priority::MEDIUM), std::make_tuple(ov::intel_gpu::hint::queue_throttle, ov::intel_gpu::hint::ThrottleLevel::MEDIUM), std::make_tuple(ov::intel_gpu::hint::queue_priority, ov::hint::Priority::MEDIUM), + std::make_tuple(ov::intel_gpu::hint::enable_sdpa_optimization, false), std::make_tuple(ov::intel_gpu::enable_loop_unrolling, true), std::make_tuple(ov::intel_gpu::disable_winograd_convolution, false), std::make_tuple(ov::internal::exclusive_async_requests, false), diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/scaled_dot_product_attention.cpp new file mode 100644 index 00000000000000..3b97cde5cfe636 --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/scaled_dot_product_attention.cpp @@ -0,0 +1,248 @@ +// Copyright (C) 2022 Intel 
Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/ov_tensor_utils.hpp" +#include "common_test_utils/test_enums.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" + + +#include "openvino/opsets/opset13.hpp" +#include "transformations/op_conversions/scaled_dot_product_attention_decomposition.hpp" +#include "openvino/pass/manager.hpp" + +#include "openvino/op/parameter.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/result.hpp" +#include "openvino/op/matmul.hpp" + +#include "intel_gpu/runtime/execution_config.hpp" + +namespace { +using ov::test::InputShape; + +typedef std::tuple, // shape + bool, // is_causal + bool, // has_attn + bool, // has_scale + std::string // targetDevice + > ScaledAttnGPUTestParams; + +class ScaledAttnLayerGPUTest : public testing::WithParamInterface, + virtual public ov::test::SubgraphBaseTest { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj); + +protected: + void SetUp() override; + void generate_inputs(const std::vector& targetInputStaticShapes) override; + bool is_causal; + bool has_attn; + bool has_scale; +}; + +std::string ScaledAttnLayerGPUTest::getTestCaseName(const testing::TestParamInfo& obj) { + ov::element::Type inType; + std::vector inputShapes; + bool is_causal; + bool has_attn; + bool has_scale; + std::string targetDevice; + std::tie(inType, inputShapes, is_causal, has_attn, has_scale, targetDevice) = obj.param; + + std::ostringstream result; + result << "netPRC=" << inType << "_"; + result << "IS="; + for (const auto& inputShape : inputShapes) { + result << ov::test::utils::partialShape2str({inputShape.first}) << "_"; + } + result << "TS="; + for (const auto& shapes : inputShapes) { + for (const auto& shape : shapes.second) { + result << ov::test::utils::vec2str(shape); + result << "_"; + } + } + result << "is_causal=" << is_causal << "_"; + result << "has_attn=" << has_attn << "_"; + result << "has_scale=" << has_scale << "_"; + result << "trgDev=" << targetDevice; + + return result.str(); +} + +void ScaledAttnLayerGPUTest::SetUp() { + ov::element::Type inType; + std::vector inputShapes; + std::tie(inType, inputShapes, is_causal, has_attn, has_scale, targetDevice) = this->GetParam(); + + init_input_shapes(inputShapes); + ov::ParameterVector inputParams; + // q, k, v + inputParams.push_back(std::make_shared(inType, inputDynamicShapes[0])); + inputParams.push_back(std::make_shared(inType, inputDynamicShapes[1])); + inputParams.push_back(std::make_shared(inType, inputDynamicShapes[1])); + inputParams[0]->set_friendly_name("q"); + inputParams[1]->set_friendly_name("k"); + inputParams[2]->set_friendly_name("v"); + // special case: only scale but no attn + if (!has_attn && has_scale) { + // attention_mask:[1] + inputParams.push_back(std::make_shared(inType, ov::PartialShape{})); + inputParams.back()->set_friendly_name("attention_mask"); + // scale:[1] + inputParams.push_back(std::make_shared(inType, ov::PartialShape{1})); + inputParams.back()->set_friendly_name("scale"); + } else { + if (has_attn) { + inputParams.push_back(std::make_shared(inType, inputDynamicShapes[2])); + inputParams.back()->set_friendly_name("attention_mask"); + } + if (has_scale) { + // scale:[1] + inputParams.push_back(std::make_shared(inType, ov::PartialShape{1})); + inputParams.back()->set_friendly_name("scale"); + } + } + + // Add artificial read/value operations to the model to trigger the enabling of the SDPA operation + auto read_key = std::make_shared(inputParams.at(1), "v0"); + auto 
assign_key = std::make_shared(read_key, "v0"); + + auto read_value = std::make_shared(inputParams.at(2), "v0"); + auto assign_value = std::make_shared(read_value, "v0"); + + ov::OutputVector inputs; + for (size_t i = 0; i < inputParams.size(); i++) { + if (i == 1) + inputs.push_back(read_key); + else if (i == 2) + inputs.push_back(read_value); + else + inputs.push_back(inputParams[i]); + } + + auto sdp = std::make_shared(inputs, is_causal); + sdp->set_friendly_name("sdpa"); + + auto output = std::make_shared(sdp->output(0)); + + function = std::make_shared(ov::OutputVector{output}, ov::SinkVector{assign_key, assign_value}, inputParams, "sdpa_model"); + + functionRefs = function->clone(); + ov::pass::Manager manager; + + // Decompose ScaledDotProductAttention + manager.register_pass(); + manager.run_passes(functionRefs); + + // Enable SDPA + configuration.insert(ov::intel_gpu::hint::enable_sdpa_optimization(true)); + + auto it = std::find_if(inputShapes[1].second.begin(), inputShapes[1].second.end(), [&](const ov::Shape& shape){ + return shape[2] >= 384; + }); + + bool has_long_seq = it != inputShapes[1].second.end(); + if (inType == ov::element::f16) { + if (has_long_seq) { + abs_threshold = 0.025; + rel_threshold = 0.025; + } else { + abs_threshold = 0.005; + rel_threshold = 0.005; + } + } +} + +void ScaledAttnLayerGPUTest::generate_inputs(const std::vector& targetInputStaticShapes) { + std::vector shapes(3); + shapes[0] = targetInputStaticShapes[0]; + shapes[1] = targetInputStaticShapes[1]; + shapes[2] = targetInputStaticShapes[1]; + if (!has_attn && has_scale) { + shapes.push_back(ov::Shape{}); + shapes.push_back(ov::Shape{1}); + } else { + if (has_attn) { + shapes.push_back(targetInputStaticShapes[2]); + } + if (has_scale) { + shapes.push_back(ov::Shape{1}); + } + } + SubgraphBaseTest::generate_inputs(shapes); +} + +TEST_P(ScaledAttnLayerGPUTest, CompareWithRefs) { + ov::element::Type inType; + std::vector inputShapes; + bool is_causal; + bool has_attn; + bool has_scale; + std::string targetDevice; + std::tie(inType, inputShapes, is_causal, has_attn, has_scale, targetDevice) = this->GetParam(); + run(); +} + +const std::vector> shapes{ + // normal case, shapes of q,k,v are same + { + // q shape + {ov::test::InputShape{ov::PartialShape{-1, 8, -1, 64}, + {ov::Shape{1, 8, 100, 64}, ov::Shape{1, 8, 1, 64}, ov::Shape{2, 8, 10, 64}}} + }, + // kv shape + {ov::test::InputShape{ov::PartialShape{-1, 8, -1, 64}, + {ov::Shape{1, 8, 100, 64}, ov::Shape{1, 8, 1, 64}, ov::Shape{2, 8, 10, 64}}} + }, + // attn shape: [B, 1, -1, L0+L1] + {ov::test::InputShape{ov::PartialShape{-1, 1, -1, -1}, + {ov::Shape{1, 1, 100, 100}, ov::Shape{1, 1, 1, 1}, ov::Shape{2, 1, 10, 10}}} + }, + }, + { + // q shape + {ov::test::InputShape{ov::PartialShape{-1, 5, -1, 64}, + {ov::Shape{2, 5, 100, 64}, ov::Shape{2, 5, 1, 64}, ov::Shape{2, 5, 384, 64}}} + }, + // kv shape + {ov::test::InputShape{ov::PartialShape{-1, 5, -1, 64}, + {ov::Shape{2, 5, 100, 64}, ov::Shape{2, 5, 1, 64}, ov::Shape{2, 5, 384, 64}}} + }, + // attn shape: [B, 1, -1, L0+L1] + {ov::test::InputShape{ov::PartialShape{-1, 1, -1, -1}, + {ov::Shape{1, 1, 100, 100}, ov::Shape{1, 1, 1, 1}, ov::Shape{2, 1, 384, 384}}} + }, + }, + // heads number of kv is 1, attn mask: [B, H, L1, L0+L1] + { + // q shape + {ov::test::InputShape{ov::PartialShape{-1, 8, -1, 64}, + {ov::Shape{1, 8, 100, 64}, ov::Shape{1, 8, 1, 64}, ov::Shape{2, 8, 10, 64}}} + }, + // kv shape + {ov::test::InputShape{ov::PartialShape{-1, 1, -1, 64}, + {ov::Shape{1, 1, 100, 64}, ov::Shape{1, 1, 1, 64}, 
ov::Shape{2, 1, 10, 64}}} + }, + // attn shape + {ov::test::InputShape{ov::PartialShape{-1, 8, -1, -1}, + {ov::Shape{1, 8, 100, 100}, ov::Shape{1, 8, 1, 1}, ov::Shape{2, 8, 10, 10}}} + }, + }, +}; + +const auto params = testing::Combine(testing::Values(ov::element::f16 /*, ov::element::f32 */), + testing::ValuesIn(shapes), + testing::Values(true, false), + testing::Values(true, false), + testing::Values(true, false), + testing::Values(ov::test::utils::DEVICE_GPU)); + +INSTANTIATE_TEST_SUITE_P(smoke_ScaledAttn_GPU, + ScaledAttnLayerGPUTest, + params, + ScaledAttnLayerGPUTest::getTestCaseName); + +} // namespace diff --git a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt index 3e6e89c870f625..8bff8f56c50156 100644 --- a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt +++ b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt @@ -57,7 +57,7 @@ ov_set_threading_interface_for(${TARGET_NAME}) # Workaround to avoid warnings during LTO build if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS_RELEASE "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized") + set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS_RELEASE "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized -Wno-stringop-overflow") endif() set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp index 8ce9e294a867fe..180d8cdb036483 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp @@ -2527,150 +2527,302 @@ INSTANTIATE_TEST_SUITE_P(gemm_gpu, gemm_onednn_ndims, ::testing::ValuesIn(std::v gemm_onednn_test_params{ CASE_GEMM_ONEDNN_I8_6D }, })); -TEST(gemm_onednn, impl_replacement_with_cldnn) { - auto& engine = get_test_engine(); +class gemm_onednn: public ::testing::Test { +public: + void test_impl_replacement_with_cldnn() { + auto& engine = get_test_engine(); - if (!engine.get_device_info().supports_immad) - return; + if (!engine.get_device_info().supports_immad) + return; + + ov::Shape in1_shape = { 1, 1, 3, 4 }; + ov::Shape in2_shape = { 1, 4 }; + auto in1_layout = layout{ov::PartialShape::dynamic(in1_shape.size()), data_types::f32, format::bfyx}; + auto in2_layout = layout{ov::PartialShape::dynamic(in2_shape.size()), data_types::f32, format::bfyx}; + auto input1 = engine.allocate_memory(layout{ov::PartialShape(in1_shape), data_types::f32, format::bfyx}); + auto input2 = engine.allocate_memory(layout{ov::PartialShape(in2_shape), data_types::f32, format::bfyx}); + + std::vector input1_data = { + 1.f, -2.f, 3.f, -4.f, + 5.f, 6.f, 1.f, 2.f, + 3.f, 3.f, 2.f, -1.f, + }; + + std::vector input2_data = { + 2.f, 5.f, -4.f, -7.f, + }; + set_values(input1, input1_data); + set_values(input2, input2_data); + + std::vector out_data = { + 8.f, 22.f, 20.f + }; + + topology topology; + topology.add(input_layout("input1", in1_layout), + input_layout("input2", in2_layout), + gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f32, false, true, 1.0f, 0.0f, 4, 2) + ); + + ov::intel_gpu::ImplementationDesc fc_impl = { format::bfyx, "", impl_types::onednn }; + ExecutionConfig cfg{ ov::intel_gpu::queue_type(QueueTypes::in_order), + ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", fc_impl} }), + ov::intel_gpu::optimize_data(true), + 
ov::intel_gpu::allow_new_shape_infer(true) }; + + network network(engine, topology, cfg); + network.set_input_data("input1", input1); + network.set_input_data("input2", input2); + + auto inst = network.get_primitive("gemm"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != nullptr); + ASSERT_TRUE(impl->is_dynamic()); + + auto outputs = network.execute(); + + auto output = outputs.at("gemm").get_memory(); + cldnn::mem_lock output_ptr(output, get_test_stream()); + + ASSERT_EQ(output_ptr.size(), (uint32_t)3); + for (uint32_t i = 0; i < out_data.size(); ++i) { + ASSERT_FLOAT_EQ(output_ptr[i], out_data[i]); + } + + // WA: Call wait_all() to wait for all queued kernels compilation finish + network.get_program()->get_compilation_context().wait_all(); - ov::Shape in1_shape = { 1, 1, 3, 4 }; - ov::Shape in2_shape = { 1, 4 }; - auto in1_layout = layout{ov::PartialShape::dynamic(in1_shape.size()), data_types::f32, format::bfyx}; - auto in2_layout = layout{ov::PartialShape::dynamic(in2_shape.size()), data_types::f32, format::bfyx}; - auto input1 = engine.allocate_memory(layout{ov::PartialShape(in1_shape), data_types::f32, format::bfyx}); - auto input2 = engine.allocate_memory(layout{ov::PartialShape(in2_shape), data_types::f32, format::bfyx}); - - std::vector input1_data = { - 1.f, -2.f, 3.f, -4.f, - 5.f, 6.f, 1.f, 2.f, - 3.f, 3.f, 2.f, -1.f, - }; - - std::vector input2_data = { - 2.f, 5.f, -4.f, -7.f, - }; - set_values(input1, input1_data); - set_values(input2, input2_data); - - std::vector out_data = { - 8.f, 22.f, 20.f - }; - - topology topology; - topology.add(input_layout("input1", in1_layout), - input_layout("input2", in2_layout), - gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f32, false, true, 1.0f, 0.0f, 4, 2) - ); - - ov::intel_gpu::ImplementationDesc fc_impl = { format::bfyx, "", impl_types::onednn }; - ExecutionConfig cfg{ ov::intel_gpu::queue_type(QueueTypes::in_order), - ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", fc_impl} }), - ov::intel_gpu::optimize_data(true), - ov::intel_gpu::allow_new_shape_infer(true) }; - - network network(engine, topology, cfg); - network.set_input_data("input1", input1); - network.set_input_data("input2", input2); - - auto inst = network.get_primitive("gemm"); - auto impl = inst->get_impl(); - ASSERT_TRUE(impl != nullptr); - ASSERT_TRUE(impl->is_dynamic()); - - auto outputs = network.execute(); - - auto output = outputs.at("gemm").get_memory(); - cldnn::mem_lock output_ptr(output, get_test_stream()); - - ASSERT_EQ(output_ptr.size(), (uint32_t)3); - for (uint32_t i = 0; i < out_data.size(); ++i) { - ASSERT_FLOAT_EQ(output_ptr[i], out_data[i]); + // Check if OneDNN's impl is used for the next execute() call + network.execute(); + inst = network.get_primitive("gemm"); + impl = inst->get_impl(); + ASSERT_TRUE(impl != nullptr); + ASSERT_FALSE(impl->is_dynamic()); } - // WA: Call wait_all() to wait for all queued kernels compilation finish - network.get_program()->get_compilation_context().wait_all(); + void test_check_transpose_format(const std::vector& permute_order) { + auto& engine = get_test_engine(); + tests::random_generator rg; + rg.set_seed(GET_SUITE_NAME); - // Check if OneDNN's impl is used for the next execute() call - network.execute(); - inst = network.get_primitive("gemm"); - impl = inst->get_impl(); - ASSERT_TRUE(impl != nullptr); - ASSERT_FALSE(impl->is_dynamic()); -} + if (!engine.get_device_info().supports_immad) + return; -// Check gemm_onednn transpose_format() can accept transpose 
white list format (byfx/bxfy) -TEST(gemm_onednn, check_transpose_format_byfx) { - auto& engine = get_test_engine(); - tests::random_generator rg; - rg.set_seed(GET_SUITE_NAME); + auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 128, 64, 12 } }); + auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 128, 64, 12 } }); - if (!engine.get_device_info().supports_immad) - return; + topology topology; + topology.add(input_layout("input0", input0->get_layout())); + topology.add(permute("permute0", input_info("input0"), permute_order)); + topology.add(input_layout("input1", input1->get_layout())); + topology.add(permute("permute1", input_info("input1"), permute_order)); + topology.add(gemm("gemm", { input_info("permute0"), input_info("permute1") }, data_types::f16, false, true)); + + ov::intel_gpu::ImplementationDesc impl = { format::bfyx, "", impl_types::onednn }; + ExecutionConfig config{ ov::intel_gpu::queue_type(QueueTypes::in_order), + ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", impl} }), + ov::intel_gpu::optimize_data(true), + ov::intel_gpu::allow_new_shape_infer(false) }; + network network(engine, topology, config); + + auto input0_data = rg.generate_random_1d(input0->get_layout().count(), -1, 1); + auto input1_data = rg.generate_random_1d(input1->get_layout().count(), -1, 1); + + set_values(input0, input0_data); + set_values(input1, input1_data); - auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 128, 64, 12 } }); - auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 128, 64, 12 } }); + network.set_input_data("input0", input0); + network.set_input_data("input1", input1); - topology topology; - topology.add(input_layout("input0", input0->get_layout())); - topology.add(permute("permute0", input_info("input0"), {0, 2, 1, 3})); - topology.add(input_layout("input1", input1->get_layout())); - topology.add(permute("permute1", input_info("input1"), {0, 2, 1, 3})); - topology.add(gemm("gemm", { input_info("permute0"), input_info("permute1") }, data_types::f16, false, true)); + ASSERT_NO_FATAL_FAILURE(network.execute()); + } - ov::intel_gpu::ImplementationDesc impl = { format::bfyx, "", impl_types::onednn }; - ExecutionConfig config{ ov::intel_gpu::queue_type(QueueTypes::in_order), - ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", impl} }), - ov::intel_gpu::optimize_data(true), - ov::intel_gpu::allow_new_shape_infer(false) }; - network network(engine, topology, config); + void test_dynamic_padding(bool n_dim_only) { + tests::random_generator rg; + rg.set_seed(GET_SUITE_NAME); - auto input0_data = rg.generate_random_1d(input0->get_layout().count(), -1, 1); - auto input1_data = rg.generate_random_1d(input1->get_layout().count(), -1, 1); + auto& engine = get_test_engine(); - set_values(input0, input0_data); - set_values(input1, input1_data); + if (!engine.get_device_info().supports_immad) + return; - network.set_input_data("input0", input0); - network.set_input_data("input1", input1); + const unsigned long BATCH_SIZE = 31; + const unsigned long M_SIZE = 11; + const unsigned long K_SIZE = 37; + const unsigned long N_SIZE = 49; - ASSERT_NO_FATAL_FAILURE(network.execute()); -} + auto fill_mem = [&](cldnn::memory_ptr mem, std::vector& data) { + cldnn::mem_lock mem_ptr(mem, get_test_stream()); + auto&& l = mem->get_layout(); + auto data_idx = 0; + for (cldnn::tensor::value_type b = 0; b < l.batch(); ++b) { + for (cldnn::tensor::value_type f = 0; f < 
l.feature(); ++f) { + for (cldnn::tensor::value_type y = 0; y < l.spatial(1); ++y) { + for (cldnn::tensor::value_type x = 0; x < l.spatial(0); ++x) { + auto tensor_coord = cldnn::tensor{{b, f, x, y}, 0}; + auto buffer_idx = l.get_linear_offset(tensor_coord); + mem_ptr[buffer_idx] = data[data_idx++]; + } + } + } + } + }; -TEST(gemm_onednn, check_transpose_format_bxfy) { - auto& engine = get_test_engine(); - tests::random_generator rg; - rg.set_seed(GET_SUITE_NAME); + const auto align_size_m = 13; + const auto align_size_k = 16; + const auto align_size_n = 15; + const auto align_size_b1 = 3; + const auto align_size_b2 = 19; - if (!engine.get_device_info().supports_immad) - return; + const auto aligned_batch1_size = align_to(1ul, align_size_b1); + auto padding_size_batch1 = static_cast(aligned_batch1_size - 1); - auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 128, 64, 12 } }); - auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 128, 64, 12 } }); + const auto aligned_batch2_size = align_to(BATCH_SIZE, align_size_b2); + auto padding_size_batch2 = static_cast(aligned_batch2_size - BATCH_SIZE); + + const auto aligned_m_size = align_to(M_SIZE, align_size_m); + auto padding_size_m = static_cast(aligned_m_size - M_SIZE); + const auto aligned_k_size = align_to(K_SIZE, align_size_k); + auto padding_size_k = static_cast(aligned_k_size - K_SIZE); + const auto aligned_n_size = align_to(N_SIZE, align_size_n); + auto padding_size_n = static_cast(aligned_n_size - N_SIZE); + + ov::Shape in1_shape = { 1, BATCH_SIZE, M_SIZE, K_SIZE }; + ov::Shape in2_shape = { 1, BATCH_SIZE, K_SIZE, N_SIZE }; + ov::Shape in1_shape_aligned = { aligned_batch1_size, aligned_batch2_size, aligned_m_size, aligned_k_size }; + ov::Shape in2_shape_aligned = { aligned_batch1_size, aligned_batch2_size, aligned_k_size, aligned_n_size }; + + // Use dynamic padding for all BFYX dimensions + tensor dyn_pad_dims_input1({0, 0, 0, 0}, 0); + tensor dyn_pad_dims_input2({0, 0, 0, 0}, 0); + + if (n_dim_only) { + dyn_pad_dims_input1 = tensor({0, 0, 0, 0}, 0); + dyn_pad_dims_input2 = tensor({0, 0, 1, 0}, 0); + } else { + dyn_pad_dims_input1 = tensor({1, 1, 1, 1}, 0); + dyn_pad_dims_input2 = tensor({1, 1, 1, 1}, 0); + } + + auto in1_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx, padding({0, 0, 0, 0}, {0, 0, 0, 0}, 0.0f, dyn_pad_dims_input1)}; + auto in2_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx, padding({0, 0, 0, 0}, {0, 0, 0, 0}, 0.0f, dyn_pad_dims_input2)}; + + auto aligned_input1_mem = engine.allocate_memory({ov::PartialShape(in1_shape_aligned), data_types::f16, format::bfyx}); + auto aligned_input2_mem = engine.allocate_memory({ov::PartialShape(in2_shape_aligned), data_types::f16, format::bfyx}); + + auto input1_mem = engine.reinterpret_buffer(*aligned_input1_mem, layout{ov::PartialShape(in1_shape), + data_types::f16, + format::bfyx, + n_dim_only ? padding({0, 0, 0, 0}, {0, 0, 0, 0}, 0.0f, dyn_pad_dims_input1) : + padding({0, 0, 0, 0}, {padding_size_batch1, padding_size_batch2, padding_size_k, padding_size_m}, 0.0f, dyn_pad_dims_input1)}); + + auto input2_mem = engine.reinterpret_buffer(*aligned_input2_mem, layout{ov::PartialShape(in2_shape), + data_types::f16, + format::bfyx, + n_dim_only ? 
padding({0, 0, 0, 0}, {0, 0, padding_size_n, 0}, 0.0f, dyn_pad_dims_input2) : + padding({0, 0, 0, 0}, {padding_size_batch1, padding_size_batch2, padding_size_n, padding_size_k}, 0.0f, dyn_pad_dims_input2)}); + + auto input_1_data = rg.generate_random_1d(ov::shape_size(in1_shape), -2, 2); + auto input_2_data = rg.generate_random_1d(ov::shape_size(in2_shape), -2, 2); + + fill_mem(input1_mem, input_1_data); + fill_mem(input2_mem, input_2_data); + + auto get_ref_results = [&]() { + ov::Shape in1_shape = { 1, BATCH_SIZE, M_SIZE, K_SIZE }; + ov::Shape in2_shape = { 1, BATCH_SIZE, K_SIZE, N_SIZE }; + auto in1_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx}; + auto in2_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx}; + + auto input1_mem = engine.allocate_memory(layout{ov::PartialShape(in1_shape), data_types::f16, format::bfyx}); + auto input2_mem = engine.allocate_memory(layout{ov::PartialShape(in2_shape), data_types::f16, format::bfyx}); + + fill_mem(input1_mem, input_1_data); + fill_mem(input2_mem, input_2_data); + + topology topology; + topology.add(input_layout("input1", in1_layout), + input_layout("input2", in2_layout), + gemm("gemm_ref", { input_info("input1"), input_info("input2") }, data_types::f16, false, false, 1.0f, 0.0f, 4, 4), + permute("permute", input_info("gemm_ref"), {0, 2, 1, 3}), + reorder("reorder", input_info("permute"), format::bfyx, data_types::f32) + ); + + ov::intel_gpu::ImplementationDesc gemm_impl = { format::bfyx, std::string(""), impl_types::onednn }; + ExecutionConfig cfg{ ov::intel_gpu::queue_type(QueueTypes::in_order), + ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", gemm_impl} }), + ov::intel_gpu::optimize_data(true), + ov::intel_gpu::allow_new_shape_infer(true) }; + + network network(engine, topology, cfg); + network.set_input_data("input1", input1_mem); + network.set_input_data("input2", input2_mem); + + auto outputs = network.execute(); + OPENVINO_ASSERT(outputs.size() == 1); + OPENVINO_ASSERT(outputs.begin()->first == "reorder"); + + auto inst = network.get_primitive("reorder"); + + auto output_mem = outputs.at("reorder").get_memory(); + auto output_layout = outputs.at("reorder").get_layout(); + + return engine.reinterpret_buffer(*output_mem, output_layout); + }; - topology topology; - topology.add(input_layout("input0", input0->get_layout())); - topology.add(permute("permute0", input_info("input0"), {0, 3, 1, 2})); - topology.add(input_layout("input1", input1->get_layout())); - topology.add(permute("permute1", input_info("input1"), {0, 3, 1, 2})); - topology.add(gemm("gemm", { input_info("permute0"), input_info("permute1") }, data_types::f16, false, true)); + topology topology; + topology.add(input_layout("input1", in1_layout), + input_layout("input2", in2_layout), + gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f16, false, false, 1.0f, 0.0f, 4, 4), + permute("permute", input_info("gemm"), {0, 2, 1, 3}), + reorder("reorder", input_info("permute"), format::bfyx, data_types::f32) + ); + + ov::intel_gpu::ImplementationDesc gemm_impl = { format::bfyx, std::string(""), impl_types::onednn }; + ExecutionConfig cfg{ ov::intel_gpu::queue_type(QueueTypes::in_order), + ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", gemm_impl} }), + ov::intel_gpu::optimize_data(true), + ov::intel_gpu::allow_new_shape_infer(true) }; + network network(engine, topology, cfg); + network.set_input_data("input1", input1_mem); + network.set_input_data("input2", input2_mem); 
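For reference, the aligned shapes and padding sizes used in this test follow from simple round-up arithmetic. A minimal standalone sketch (align_up is a hypothetical local helper standing in for align_to; the constants are the ones hard-coded above):

#include <cstddef>
#include <iostream>

// Round `value` up to the nearest multiple of `alignment`.
static std::size_t align_up(std::size_t value, std::size_t alignment) {
    return ((value + alignment - 1) / alignment) * alignment;
}

int main() {
    // Test constants: M=11, K=37, N=49, BATCH=31 with alignments 13/16/15/19.
    std::cout << align_up(11, 13) << '\n';  // 13 -> padding_size_m = 2
    std::cout << align_up(37, 16) << '\n';  // 48 -> padding_size_k = 11
    std::cout << align_up(49, 15) << '\n';  // 60 -> padding_size_n = 11
    std::cout << align_up(31, 19) << '\n';  // 38 -> padding_size_batch2 = 7
    return 0;
}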
- ov::intel_gpu::ImplementationDesc impl = { format::bfyx, "", impl_types::onednn }; - ExecutionConfig config{ ov::intel_gpu::queue_type(QueueTypes::in_order), - ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", impl} }), - ov::intel_gpu::optimize_data(true), - ov::intel_gpu::allow_new_shape_infer(false) }; - network network(engine, topology, config); + auto outputs = network.execute(); - auto input0_data = rg.generate_random_1d(input0->get_layout().count(), -1, 1); - auto input1_data = rg.generate_random_1d(input1->get_layout().count(), -1, 1); + auto output_mem = outputs.at("reorder").get_memory(); + auto output_layout = outputs.at("reorder").get_layout(); - set_values(input0, input0_data); - set_values(input1, input1_data); + auto res = engine.reinterpret_buffer(*output_mem, output_layout); - network.set_input_data("input0", input0); - network.set_input_data("input1", input1); + auto ref_res = get_ref_results(); + + mem_lock res_lock(res, get_test_stream()); + mem_lock res_ref_lock(ref_res, get_test_stream()); + for (size_t i = 0; i < res->count(); i++) { + ASSERT_EQ(res_lock[i], res_ref_lock[i]) << i; + } + } +}; + +TEST_F(gemm_onednn, impl_replacement_with_cldnn) { + this->test_impl_replacement_with_cldnn(); +} + +// Check gemm_onednn transpose_format() can accept transpose white list format (byfx/bxfy) +TEST_F(gemm_onednn, check_transpose_format_byfx) { + this->test_check_transpose_format({0, 2, 1, 3}); +} + +TEST_F(gemm_onednn, check_transpose_format_bxfy) { + this->test_check_transpose_format({0, 3, 1, 2}); +} + +TEST_F(gemm_onednn, dynamic_padding_all_dim) { + this->test_dynamic_padding(false); +} - ASSERT_NO_FATAL_FAILURE(network.execute()); +TEST_F(gemm_onednn, dynamic_padding_n_dim_only) { + this->test_dynamic_padding(true); } template diff --git a/src/plugins/intel_gpu/tests/unit/transformations/transpose_matmul_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/transpose_matmul_fusion_test.cpp index 61638930c3b63f..f97ac8f9c433a1 100644 --- a/src/plugins/intel_gpu/tests/unit/transformations/transpose_matmul_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/transformations/transpose_matmul_fusion_test.cpp @@ -13,7 +13,7 @@ #include "openvino/op/result.hpp" #include "intel_gpu/op/gemm.hpp" -#include "plugin/transformations/transpose_matmul_fusion.hpp" +#include "plugin/transformations/transpose_fusion.hpp" #include @@ -31,7 +31,7 @@ TEST_F(TransformationTestsF, TranposeMatmulFusion1) { auto matmul = std::make_shared(input_a, input_b); model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input_a, input_b }); - manager.register_pass(); + manager.register_pass(); } { std::vector order_a = {0, 1, 2, 3}; @@ -55,7 +55,7 @@ TEST_F(TransformationTestsF, TranposeMatmulFusion2) { auto matmul = std::make_shared(tranpose_a, input_b); model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input_a, input_b }); - manager.register_pass(); + manager.register_pass(); } { std::vector order_a = {0, 2, 1, 3}; @@ -81,7 +81,7 @@ TEST_F(TransformationTestsF, TranposeMatmulFusion3) { auto matmul = std::make_shared(tranpose_a, tranpose_b); model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input_a, input_b }); - manager.register_pass(); + manager.register_pass(); } { std::vector order_a = {0, 2, 1, 3}; @@ -109,7 +109,7 @@ TEST_F(TransformationTestsF, TranposeMatmulFusion4) { auto tranpose_c = std::make_shared(matmul, tranpose_c_const); model = std::make_shared(ov::NodeVector{ tranpose_c }, 
ov::ParameterVector{ input_a, input_b }); - manager.register_pass(); + manager.register_pass(); } { std::vector order_a = {0, 2, 1, 3}; diff --git a/src/plugins/intel_gpu/tests/unit/transformations/transpose_sdpa_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/transpose_sdpa_fusion_test.cpp new file mode 100644 index 00000000000000..ebe15f4d806b31 --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/transformations/transpose_sdpa_fusion_test.cpp @@ -0,0 +1,178 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/ov_test_utils.hpp" + +#include "openvino/core/model.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/pass/manager.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/result.hpp" +#include "intel_gpu/op/sdpa.hpp" + +#include "plugin/transformations/transpose_fusion.hpp" + +#include + +using namespace testing; +using namespace ov::intel_gpu; + +namespace ov { +namespace test { +namespace intel_gpu { + +TEST_F(TransformationTestsF, TranposeSDPAFusion1) { + { + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto sdpa = std::make_shared(input_a, input_b, input_c, true); + + model = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + manager.register_pass(); + } + { + std::vector order_a = {0, 1, 2, 3}; + std::vector order_b = {0, 1, 2, 3}; + std::vector order_c = {0, 1, 2, 3}; + std::vector order_output = {0, 1, 2, 3}; + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto sdpa = std::make_shared(input_a, input_b, input_c, order_a, order_b, order_c, order_output, true, ov::element::undefined ); + + model_ref = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, TranposeSDPAFusion2) { + { + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_a_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 2, 1, 3}); + auto tranpose_a = std::make_shared(input_a, tranpose_a_const); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto sdpa = std::make_shared(tranpose_a, input_b, input_c, true); + + model = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + manager.register_pass(); + } + { + std::vector order_a = {0, 2, 1, 3}; + std::vector order_b = {0, 1, 2, 3}; + std::vector order_c = {0, 1, 2, 3}; + std::vector order_output = {0, 1, 2, 3}; + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto sdpa = std::make_shared(input_a, input_b, input_c, order_a, order_b, order_c, order_output, true, 
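The order_a/order_b/order_c vectors recorded on the fused SDPA follow the usual transpose semantics, out_shape[i] = in_shape[order[i]]. A small illustration with made-up dimensions:

#include <array>
#include <cstddef>
#include <iostream>

int main() {
    // Hypothetical [batch, seq, heads, head_dim] shape and the {0, 2, 1, 3}
    // order used by several of these tests.
    const std::array<std::size_t, 4> in_shape{2, 16, 8, 64};
    const std::array<std::size_t, 4> order{0, 2, 1, 3};
    std::array<std::size_t, 4> out_shape{};
    for (std::size_t i = 0; i < order.size(); ++i)
        out_shape[i] = in_shape[order[i]];
    for (auto d : out_shape)
        std::cout << d << ' ';  // prints: 2 8 16 64
    std::cout << '\n';
    return 0;
}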
ov::element::undefined); + + model_ref = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, TranposeSDPAFusion3) { + { + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_a_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 2, 1, 3}); + auto tranpose_a = std::make_shared(input_a, tranpose_a_const); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_b_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {1, 2, 0, 3}); + auto tranpose_b = std::make_shared(input_b, tranpose_b_const); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + + auto sdpa = std::make_shared(tranpose_a, tranpose_b, input_c, false); + + model = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + manager.register_pass(); + } + { + std::vector order_a = {0, 2, 1, 3}; + std::vector order_b = {1, 2, 0, 3}; + std::vector order_c = {0, 1, 2, 3}; + std::vector order_output = {0, 1, 2, 3}; + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto sdpa = std::make_shared(input_a, input_b, input_c, order_a, order_b, order_c, order_output, false, ov::element::undefined); + + model_ref = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, TranposeSDPAFusion4) { + { + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_a_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 2, 1, 3}); + auto tranpose_a = std::make_shared(input_a, tranpose_a_const); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_b_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 2, 1, 3}); + auto tranpose_b = std::make_shared(input_b, tranpose_b_const); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_c_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 2, 1, 3}); + auto tranpose_c = std::make_shared(input_c, tranpose_c_const); + + auto sdpa = std::make_shared(tranpose_a, tranpose_b, tranpose_c, false); + + model = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + manager.register_pass(); + } + { + std::vector order_a = {0, 2, 1, 3}; + std::vector order_b = {0, 2, 1, 3}; + std::vector order_c = {0, 2, 1, 3}; + std::vector order_output = {0, 1, 2, 3}; + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto sdpa = std::make_shared(input_a, input_b, input_c, order_a, order_b, order_c, order_output, false, ov::element::undefined); + + model_ref = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, TranposeSDPAFusion5) { + { + auto 
input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_a_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 2, 1, 3}); + auto tranpose_a = std::make_shared(input_a, tranpose_a_const); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_b_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 2, 1, 3}); + auto tranpose_b = std::make_shared(input_b, tranpose_b_const); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_c_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {3, 2, 1, 0}); + auto tranpose_c = std::make_shared(input_c, tranpose_c_const); + + auto sdpa = std::make_shared(tranpose_a, tranpose_b, tranpose_c, false); + + model = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + manager.register_pass(); + } + { + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_a_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 2, 1, 3}); + auto tranpose_a = std::make_shared(input_a, tranpose_a_const); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_b_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 2, 1, 3}); + auto tranpose_b = std::make_shared(input_b, tranpose_b_const); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_c_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {3, 2, 1, 0}); + auto tranpose_c = std::make_shared(input_c, tranpose_c_const); + + auto sdpa = std::make_shared(tranpose_a, tranpose_b, tranpose_c, false); + + model_ref = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +} // namespace intel_gpu +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_npu/src/backend/include/zero_backend.hpp b/src/plugins/intel_npu/src/backend/include/zero_backend.hpp index 587d17be09adcb..7f4524ec8127ca 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_backend.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_backend.hpp @@ -7,6 +7,7 @@ #include #include +#include "intel_npu/utils/logger/logger.hpp" #include "npu.hpp" #include "zero_init.hpp" @@ -30,6 +31,7 @@ class ZeroEngineBackend final : public IEngineBackend { std::shared_ptr _instance; std::map> _devices{}; + Logger _logger; }; } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/backend/src/zero_backend.cpp b/src/plugins/intel_npu/src/backend/src/zero_backend.cpp index 0cbd12e91878f2..94a87ab725dae6 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_backend.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_backend.cpp @@ -11,13 +11,15 @@ namespace intel_npu { -ZeroEngineBackend::ZeroEngineBackend(const Config& config) { +ZeroEngineBackend::ZeroEngineBackend(const Config& config) : _logger("ZeroEngineBackend", config.get()) { + _logger.debug("ZeroEngineBackend - initialize started"); Logger::global().setLevel(config.get()); _instance = std::make_shared(); auto device = std::make_shared(_instance); _devices.emplace(std::make_pair(device->getName(), device)); + _logger.debug("ZeroEngineBackend - initialize completed"); } uint32_t ZeroEngineBackend::getDriverVersion() const { @@ -36,8 +38,10 @@ ZeroEngineBackend::~ZeroEngineBackend() = default; const 
std::shared_ptr ZeroEngineBackend::getDevice() const { if (_devices.empty()) { + _logger.debug("ZeroEngineBackend - getDevice() returning empty list"); return {}; } else { + _logger.debug("ZeroEngineBackend - getDevice() returning device list"); return _devices.begin()->second; } } @@ -48,10 +52,12 @@ const std::shared_ptr ZeroEngineBackend::getDevice(const std::string& / } const std::vector ZeroEngineBackend::getDeviceNames() const { + _logger.debug("ZeroEngineBackend - getDeviceNames started"); std::vector devicesNames; std::for_each(_devices.cbegin(), _devices.cend(), [&devicesNames](const auto& device) { devicesNames.push_back(device.first); }); + _logger.debug("ZeroEngineBackend - getDeviceNames completed and returning result"); return devicesNames; } diff --git a/src/plugins/intel_npu/src/backend/src/zero_device.cpp b/src/plugins/intel_npu/src/backend/src/zero_device.cpp index cfa0cdaef34713..a29261bffe7d65 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_device.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_device.cpp @@ -16,6 +16,7 @@ ZeroDevice::ZeroDevice(const std::shared_ptr& initStructs : _initStructs(initStructs), _graph_ddi_table_ext(_initStructs->getGraphDdiTable()), log("ZeroDevice", Logger::global().level()) { + log.debug("ZeroDevice::ZeroDevice init"); device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; zeroUtils::throwOnFail("zeDeviceGetProperties", zeDeviceGetProperties(_initStructs->getDevice(), &device_properties)); @@ -70,6 +71,7 @@ ZeroDevice::ZeroDevice(const std::shared_ptr& initStructs "zeDeviceGetCommandQueueGroupProperties", zeDeviceGetCommandQueueGroupProperties(_initStructs->getDevice(), &command_queue_group_count, nullptr)); + log.debug("ZeroDevice::ZeroDevice - resize command_queue_group_count"); command_group_properties.resize(command_queue_group_count); for (auto& prop : command_group_properties) { @@ -83,7 +85,9 @@ ZeroDevice::ZeroDevice(const std::shared_ptr& initStructs command_group_properties.data())); // Find the corresponding command queue group. 
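The debug message added next brackets the group-ordinal lookup. As a rough, hedged illustration of what such a lookup does (not the actual zeroUtils::findGroupOrdinal, which also consults the device properties), a compute-capable queue group is typically picked like this:

#include <level_zero/ze_api.h>

#include <cstdint>
#include <vector>

// Illustrative only: return the ordinal of the first queue group that
// advertises compute capability, falling back to group 0.
uint32_t find_compute_group_ordinal(const std::vector<ze_command_queue_group_properties_t>& groups) {
    for (uint32_t ordinal = 0; ordinal < static_cast<uint32_t>(groups.size()); ++ordinal) {
        if (groups[ordinal].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
            return ordinal;
        }
    }
    return 0;
}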
+ log.debug("ZeroDevice::ZeroDevice - findGroupOrdinal"); _group_ordinal = zeroUtils::findGroupOrdinal(command_group_properties, device_properties); + log.debug("ZeroDevice::ZeroDevice - init completed"); } std::shared_ptr ZeroDevice::createExecutor( diff --git a/src/plugins/intel_npu/src/backend/src/zero_executor.cpp b/src/plugins/intel_npu/src/backend/src/zero_executor.cpp index 468fbf6d95c761..4882a552155883 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_executor.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_executor.cpp @@ -44,19 +44,23 @@ ZeroExecutor::ZeroExecutor(const std::shared_ptr& i zeroUtils::toZeQueuePriority(_config.get()), _config, group_ordinal)}} { + _logger.debug("ZeroExecutor::ZeroExecutor - create graph_command_list"); OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Executor::ZeroExecutor"); CommandList graph_command_list(_initStructs->getDevice(), _initStructs->getContext(), _initStructs->getGraphDdiTable(), _config, _group_ordinal); + _logger.debug("ZeroExecutor::ZeroExecutor - create graph_command_queue"); CommandQueue graph_command_queue(_initStructs->getDevice(), _initStructs->getContext(), ZE_COMMAND_QUEUE_PRIORITY_NORMAL, _config, _group_ordinal); + _logger.debug("ZeroExecutor::ZeroExecutor - create fence"); Fence fence(graph_command_queue, _config); + _logger.debug("ZeroExecutor::ZeroExecutor - create graph"); OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_GRAPH, itt::domains::LevelZeroBackend, "Executor::ZeroExecutor", "graphCreate"); ze_graph_desc_t desc{ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES, @@ -79,6 +83,7 @@ ZeroExecutor::ZeroExecutor(const std::shared_ptr& i } OV_ITT_TASK_NEXT(ZERO_EXECUTOR_GRAPH, "pfnGetArgumentProperties3"); + _logger.debug("ZeroExecutor::ZeroExecutor - performing pfnGetArgumentProperties3"); for (uint32_t index = 0; index < _props.numGraphArgs; ++index) { ze_graph_argument_properties_3_t arg3; zeroUtils::throwOnFail("pfnGetArgumentProperties3", @@ -104,12 +109,17 @@ ZeroExecutor::ZeroExecutor(const std::shared_ptr& i } OV_ITT_TASK_NEXT(ZERO_EXECUTOR_GRAPH, "appendGraphInitialize"); + _logger.debug("ZeroExecutor::ZeroExecutor - performing appendGraphInitialize"); graph_command_list.appendGraphInitialize(_graph); + _logger.debug("ZeroExecutor::ZeroExecutor - closing graph command list"); graph_command_list.close(); OV_ITT_TASK_NEXT(ZERO_EXECUTOR_GRAPH, "queue_execute"); + _logger.debug("ZeroExecutor::ZeroExecutor - performing executeCommandList"); graph_command_queue.executeCommandList(graph_command_list, fence); + _logger.debug("ZeroExecutor::ZeroExecutor - performing hostSynchronize"); fence.hostSynchronize(); + _logger.debug("ZeroExecutor::ZeroExecutor - hostSynchronize completed"); } void ZeroExecutor::setArgumentValue(uint32_t argi_, const void* argv_) const { diff --git a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp index 66b3e43017237c..b03981e0448769 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp @@ -155,6 +155,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& _profilingQuery(0, _executor->getInitStructs()->getDevice(), _executor->getInitStructs()->getProfilingDdiTable()) { + _logger.debug("ZeroInferRequest::ZeroInferRequest - SyncInferRequest"); const std::unordered_map& executorInputDescriptors = _executor->inputs_desc_map(); const std::unordered_map& executorOutputDescriptors = @@ -162,6 +163,7 @@ ZeroInferRequest::ZeroInferRequest(const 
std::shared_ptr& auto proftype = config.get(); if (proftype == ov::intel_npu::ProfilingType::INFER) { + _logger.debug("ZeroInferRequest::ZeroInferRequest - profiling type == ov::intel_npu::ProfilingType::INFER"); _npuProfiling = std::make_shared(_executor->getInitStructs()->getContext(), _executor->getInitStructs()->getDevice(), _config.get()); @@ -178,6 +180,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& auto allocator = zeroMemory::HostMemAllocator(backendPtr); + _logger.debug("ZeroInferRequest::ZeroInferRequest - performing I/O buffer allocation using Level Zero API"); for (const std::string& inputName : _metadata.inputNames) { if (!executorInputDescriptors.count(inputName)) { OPENVINO_THROW("Invalid graph input descriptor key: " + inputName); @@ -230,6 +233,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& } } + _logger.debug("ZeroInferRequest::ZeroInferRequest - checking level zero attributes and allocate tensor"); for (const std::string& outputName : _metadata.outputNames) { IONodeDescriptor& resultDescriptor = _metadata.results.at(outputName); checkLevelZeroAttributesMatch(resultDescriptor, executorOutputDescriptors.at(outputName), outputName); @@ -257,6 +261,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& } } + _logger.debug("ZeroInferRequest::ZeroInferRequest - capturing latest tensor value in output"); for (const std::string& stateName : _metadata.stateNames) { const std::string& stateInputBufferName = READVALUE_PREFIX + stateName; const std::string& stateOutputBufferName = ASSIGN_PREFIX + stateName; @@ -281,6 +286,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& allocate_tensor(stateName, stateDescriptor, TensorType::State, allocator); } + _logger.debug("ZeroInferRequest::ZeroInferRequest - constructing pipeline"); /// Construct pipepline _pipeline = makePipeline(_executorPtr, _config, @@ -289,6 +295,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& _npuProfiling, _copyAllTensors, _batchSize); + _logger.debug("ZeroInferRequest::ZeroInferRequest - SyncInferRequest completed"); } void ZeroInferRequest::infer() { @@ -382,6 +389,10 @@ void ZeroInferRequest::check_network_precision(const ov::element::Type_t precisi break; case ov::element::Type_t::f16: break; + case ov::element::Type_t::u4: + break; + case ov::element::Type_t::i4: + break; case ov::element::Type_t::u8: break; case ov::element::Type_t::i8: @@ -400,11 +411,12 @@ void ZeroInferRequest::check_network_precision(const ov::element::Type_t precisi break; default: OPENVINO_THROW("Unsupported tensor precision: " + ov::element::Type(precision).get_type_name() + - "! Supported precisions: FP32, FP16, U8, I8, U16, I16, U32, I32, U64, I64"); + "! 
Supported precisions: FP32, FP16, U4, I4, U8, I8, U16, I16, U32, I32, U64, I64"); } } std::vector ZeroInferRequest::get_profiling_info() const { + _logger.debug("InferRequest::get_profiling_info started"); const auto& compiledModel = *std::dynamic_pointer_cast(_compiledModel); const auto& compilerConfig = compiledModel.get_config(); if (!compilerConfig.get() || !_config.get()) { @@ -428,6 +440,7 @@ std::vector ZeroInferRequest::get_profiling_info() const { return _profilingQuery.getLayerStatistics(); } } + _logger.debug("InferRequest::get_profiling_info completed"); } std::vector ZeroInferRequest::get_raw_profiling_data() const { diff --git a/src/plugins/intel_npu/src/backend/src/zero_init.cpp b/src/plugins/intel_npu/src/backend/src/zero_init.cpp index 05e1a76b8d5215..9efeda7b01f4a9 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_init.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_init.cpp @@ -62,6 +62,7 @@ static std::tuple queryDriverExtensionVersion(ze_driver_h ZeroInitStructsHolder::ZeroInitStructsHolder() : log("NPUZeroInitStructsHolder", Logger::global().level()) { OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "ZeroInitStructsHolder::ZeroInitStructsHolder"); + log.debug("ZeroInitStructsHolder - performing zeInit on VPU only"); zeroUtils::throwOnFail("zeInit", zeInit(ZE_INIT_FLAG_VPU_ONLY)); uint32_t drivers = 0; @@ -72,6 +73,7 @@ ZeroInitStructsHolder::ZeroInitStructsHolder() : log("NPUZeroInitStructsHolder", // Get our target driver driver_properties.stype = ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES; + log.debug("ZeroInitStructsHolder - setting driver properties to ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES"); for (uint32_t i = 0; i < drivers; ++i) { zeDriverGetProperties(all_drivers[i], &driver_properties); @@ -105,6 +107,7 @@ ZeroInitStructsHolder::ZeroInitStructsHolder() : log("NPUZeroInitStructsHolder", // Query our graph extension version std::string graph_ext_name; + log.debug("ZeroInitStructsHolder - tie output of queryDriverExtensionVersion"); std::tie(driver_ext_version, graph_ext_name) = queryDriverExtensionVersion(driver_handle); log.debug("Found Driver Version %d.%d, Driver Extension Version %d.%d (%s)", @@ -136,10 +139,12 @@ ZeroInitStructsHolder::ZeroInitStructsHolder() : log("NPUZeroInitStructsHolder", ze_context_desc_t context_desc = {ZE_STRUCTURE_TYPE_CONTEXT_DESC, 0, 0}; zeroUtils::throwOnFail("zeContextCreate", zeContextCreate(driver_handle, &context_desc, &context)); + log.debug("ZeroInitStructsHolder initialize complete"); } ZeroInitStructsHolder::~ZeroInitStructsHolder() { if (context) { + log.debug("ZeroInitStructsHolder - performing zeContextDestroy"); auto result = zeContextDestroy(context); if (ZE_RESULT_SUCCESS != result) { log.error("zeContextDestroy failed %#X", uint64_t(result)); diff --git a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp index 38bebff17de601..c34f5578e55120 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp @@ -35,7 +35,9 @@ struct DiscretePipeline final : public Pipeline { _event_pool(device_handle, context, stage::COUNT, _config), _event{{{_event_pool.handle(), stage::UPLOAD, _config}, {_event_pool.handle(), stage::EXECUTE, _config}, - {_event_pool.handle(), stage::READBACK, _config}}} { + {_event_pool.handle(), stage::READBACK, _config}}}, + _logger("DiscretePipeline", _config.get()) { + _logger.debug("DiscretePipeline - initialize started"); const ZeroExecutor* executor = 
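With this change the zero infer request also accepts 4-bit integer tensors. A compact restatement of the whitelist enforced by check_network_precision (illustrative helper, not plugin code):

#include "openvino/core/type/element_type.hpp"

// Returns true for the precisions the NPU infer request now accepts;
// u4 and i4 are the newly allowed entries.
bool is_supported_precision(ov::element::Type_t p) {
    switch (p) {
    case ov::element::Type_t::f32:
    case ov::element::Type_t::f16:
    case ov::element::Type_t::u4:   // newly allowed
    case ov::element::Type_t::i4:   // newly allowed
    case ov::element::Type_t::u8:
    case ov::element::Type_t::i8:
    case ov::element::Type_t::u16:
    case ov::element::Type_t::i16:
    case ov::element::Type_t::u32:
    case ov::element::Type_t::i32:
    case ov::element::Type_t::u64:
    case ov::element::Type_t::i64:
        return true;
    default:
        return false;
    }
}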
static_cast(executorPtr.get()); static const std::size_t alignment = STANDARD_PAGE_SIZE; @@ -45,6 +47,8 @@ struct DiscretePipeline final : public Pipeline { } _deviceInputs.allocate(device_handle, context); + _logger.debug("DiscretePipeline - appending memory copy and set argument value for input"); + for (const auto& desc : executor->inputs_desc_map()) { const std::shared_ptr& inputTensor = tensors.at(desc.first); const void* tensorBuffer = reinterpret_cast(inputTensor->data()); @@ -57,6 +61,8 @@ struct DiscretePipeline final : public Pipeline { executor->setArgumentValue(desc.second.idx, _deviceInputs.getDevicePtr(desc.first)); } + _logger.debug("DiscretePipeline - append signal event"); + _command_list[stage::UPLOAD].appendBarrier(); _event[stage::UPLOAD].AppendSignalEvent(_command_list[stage::UPLOAD]); @@ -65,6 +71,7 @@ struct DiscretePipeline final : public Pipeline { } _deviceOutputs.allocate(device_handle, context); + _logger.debug("DiscretePipeline - appending memory copy and set argument value for output"); for (const auto& desc : executor->outputs_desc_map()) { const std::shared_ptr& outputTensor = tensors.at(desc.first); void* tensorBuffer = reinterpret_cast(outputTensor->data()); @@ -80,14 +87,15 @@ struct DiscretePipeline final : public Pipeline { } _event[stage::UPLOAD].AppendWaitOnEvent(_command_list[stage::EXECUTE]); - + _logger.debug("DiscretePipeline - appendGraphExecute"); _command_list[stage::EXECUTE].appendGraphExecute(executor->graph(), profiling_handle); - + _logger.debug("DiscretePipeline - appendEventReset"); _event[stage::UPLOAD].AppendEventReset(_command_list[stage::READBACK]); for (auto& commandList : _command_list) { commandList.close(); } + _logger.debug("DiscretePipeline - initialize completed"); }; DiscretePipeline(const DiscretePipeline&) = delete; @@ -95,6 +103,7 @@ struct DiscretePipeline final : public Pipeline { virtual ~DiscretePipeline() = default; void push(size_t) override { + _logger.debug("DiscretePipeline - push() started"); OV_ITT_TASK_CHAIN(ZERO_INFER_REQUEST_DP_PUSH, itt::domains::LevelZeroBackend, "DiscretePipeline::push", @@ -105,9 +114,11 @@ struct DiscretePipeline final : public Pipeline { OV_ITT_TASK_NEXT(ZERO_INFER_REQUEST_DP_PUSH, "EXECUTE"); // Submit the command list for execute _command_queues[stage::EXECUTE]->executeCommandList(_command_list[stage::EXECUTE], _fence[stage::EXECUTE]); + _logger.debug("DiscretePipeline - push() completed"); }; void pull(size_t) override { + _logger.debug("DiscretePipeline - pull() started"); OV_ITT_TASK_CHAIN(ZERO_INFER_REQUEST_DP_PULL, itt::domains::LevelZeroBackend, "DiscretePipeline::pull", @@ -120,6 +131,7 @@ struct DiscretePipeline final : public Pipeline { // Wait for output copy to finish execution for _fence from the host, to make sure that data // is available in the hostMem buffer of the output _fence[stage::READBACK].hostSynchronize(); + _logger.debug("DiscretePipeline - pull() completed"); }; void reset(size_t) const override { @@ -136,6 +148,7 @@ struct DiscretePipeline final : public Pipeline { std::array _fence; EventPool _event_pool; std::array _event; + Logger _logger; }; struct IntegratedPipeline final : public Pipeline { @@ -154,15 +167,18 @@ struct IntegratedPipeline final : public Pipeline { : _config(config), _command_queue{command_queue}, _event_pool{device_handle, context, batch_size ? 
static_cast(batch_size) : 1, _config}, - _npu_profiling(std::move(npu_profiling)) { + _npu_profiling(std::move(npu_profiling)), + _logger("IntegratedPipeline", _config.get()) { const ZeroExecutor* executor = static_cast(executorPtr.get()); OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Zero_infer_request::IntegratedPipeline::IntegratedPipeline"); + _logger.debug("IntegratedPipeline - initialize started"); _command_lists.reserve(batch_size); _events.reserve(batch_size); _fences.reserve(batch_size); + _logger.debug("IntegratedPipeline - emplace_back _event_pool and _command_queue"); for (size_t i = 0; i < batch_size; i++) { _command_lists.emplace_back( std::make_unique(device_handle, context, graph_ddi_table_ext, _config, group_ordinal)); @@ -209,6 +225,7 @@ struct IntegratedPipeline final : public Pipeline { } _command_lists.at(i)->close(); } + _logger.debug("IntegratedPipeline - initialize completed"); } IntegratedPipeline(const IntegratedPipeline&) = delete; @@ -216,15 +233,18 @@ struct IntegratedPipeline final : public Pipeline { virtual ~IntegratedPipeline() = default; void push(size_t batch_index) override { + _logger.debug("IntegratedPipeline - push() started"); OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_PUSH, itt::domains::LevelZeroBackend, "IntegratedPipeline", "push"); if (sync_output_with_fences_) { _command_queue.executeCommandList(*_command_lists.at(batch_index), *_fences.at(batch_index)); } else { _command_queue.executeCommandList(*_command_lists.at(batch_index)); } + _logger.debug("IntegratedPipeline - push() completed"); }; void pull(size_t batch_index) override { + _logger.debug("IntegratedPipeline - pull() started"); OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_PULL, itt::domains::LevelZeroBackend, "IntegratedPipeline", "pull"); if (sync_output_with_fences_) { _fences.at(batch_index)->hostSynchronize(); @@ -235,14 +255,17 @@ struct IntegratedPipeline final : public Pipeline { if (_npu_profiling != nullptr) { _npu_profiling->sampleNpuTimestamps(); } + _logger.debug("IntegratedPipeline - pull() completed"); }; void reset(size_t batch_index) const override { + _logger.debug("IntegratedPipeline - rest() started"); if (sync_output_with_fences_) { _fences.at(batch_index)->reset(); } else { _events.at(batch_index)->reset(); } + _logger.debug("IntegratedPipeline - rest() completed"); }; private: @@ -254,6 +277,7 @@ struct IntegratedPipeline final : public Pipeline { std::vector> _events; bool sync_output_with_fences_ = true; std::shared_ptr _npu_profiling; + Logger _logger; }; std::unique_ptr makePipeline(const std::shared_ptr& executorPtr, diff --git a/src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp b/src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp index 2bb2f951d4d634..9607fdbdd7749e 100644 --- a/src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp +++ b/src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp @@ -763,6 +763,8 @@ NetworkDescription LevelZeroCompilerInDriver::compileIR(const st getLatestBuildError()); auto networkMeta = getNetworkMeta(graphHandle); + networkMeta.name = model->get_friendly_name(); + result = _graphDdiTableExt->pfnDestroy(graphHandle); if (ZE_RESULT_SUCCESS != result) { diff --git a/src/plugins/template/backend/ops/ops_evaluates.hpp b/src/plugins/template/backend/ops/ops_evaluates.hpp index 8a44f4f6f2e6cc..8e7d24f82092a5 100644 --- a/src/plugins/template/backend/ops/ops_evaluates.hpp +++ b/src/plugins/template/backend/ops/ops_evaluates.hpp @@ -7,6 +7,7 @@ #include "openvino/op/rms_norm.hpp" #include 
"ov_ops/augru_cell.hpp" #include "ov_ops/augru_sequence.hpp" +#include "ov_ops/rms.hpp" extern template bool evaluate_node(std::shared_ptr node, ov::TensorVector& outputs, @@ -498,7 +499,7 @@ extern template bool evaluate_node(std::shared_ptr(std::shared_ptr node, +extern template bool evaluate_node(std::shared_ptr node, ov::TensorVector& outputs, const ov::TensorVector& inputs); @@ -516,3 +517,7 @@ extern template bool evaluate_node(std::shared_ptr< extern template bool evaluate_node(std::shared_ptr node, ov::TensorVector& outputs, const ov::TensorVector& inputs); + +extern template bool evaluate_node(std::shared_ptr node, + ov::TensorVector& outputs, + const ov::TensorVector& inputs); diff --git a/src/plugins/template/backend/ops/rms_internal.cpp b/src/plugins/template/backend/ops/rms_internal.cpp new file mode 100644 index 00000000000000..ea8183f843ccd3 --- /dev/null +++ b/src/plugins/template/backend/ops/rms_internal.cpp @@ -0,0 +1,69 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "evaluate_node.hpp" +#include "openvino/core/axis_set.hpp" +#include "openvino/core/rank.hpp" +#include "openvino/core/validation_util.hpp" +#include "openvino/op/util/axes_util.hpp" +#include "openvino/reference/rms_norm.hpp" +#include "openvino/runtime/tensor.hpp" +#include "ov_ops/rms.hpp" +#include "utils.hpp" + +using namespace ov; + +template +bool evaluate(const std::shared_ptr& node, + ov::TensorVector& outputs, + const ov::TensorVector& inputs) { + using ET = typename ov::element_type_traits::value_type; + const auto normalized_axes = + ov::util::normalize_axes(node->get_friendly_name(), std::vector{-1}, inputs[0].get_shape().size()); + + outputs[0].set_shape(inputs[0].get_shape()); + + const auto& in_type = inputs[0].get_element_type(); + const auto& out_type = outputs[0].get_element_type(); + + // The type compression mechanism is implemented for F16 only + // The scale is expected to have the same type as the first input + if (in_type != out_type && out_type == ov::element::f16) { + ov::reference::rms_norm_mul_convert_out(inputs[0].data(), + normalized_axes, + outputs[0].data(), + inputs[0].get_shape(), + node->get_epsilon(), + inputs[1].get_shape(), + inputs[1].data()); + + } else { + ov::reference::rms_norm(inputs[0].data(), + normalized_axes, + outputs[0].data(), + inputs[0].get_shape(), + node->get_epsilon(), + inputs[1].get_shape(), + inputs[1].data()); + } + return true; +} + +template <> +bool evaluate_node(std::shared_ptr node, + ov::TensorVector& outputs, + const ov::TensorVector& inputs) { + switch (node->get_input_element_type(0)) { + case element::bf16: + return evaluate(as_type_ptr(node), outputs, inputs); + case element::f16: + return evaluate(as_type_ptr(node), outputs, inputs); + case element::f64: + return evaluate(as_type_ptr(node), outputs, inputs); + case element::f32: + return evaluate(as_type_ptr(node), outputs, inputs); + default: + OPENVINO_THROW("Unhandled data type ", node->get_input_element_type(0).get_type_name(), " in evaluate_node()"); + } +} diff --git a/src/plugins/template/backend/ops/roi_align_rotated.cpp b/src/plugins/template/backend/ops/roi_align_rotated.cpp index ec409a73b072ca..60373931dffb1d 100644 --- a/src/plugins/template/backend/ops/roi_align_rotated.cpp +++ b/src/plugins/template/backend/ops/roi_align_rotated.cpp @@ -8,7 +8,7 @@ #include "openvino/reference/roi_align.hpp" template -bool evaluate(const std::shared_ptr& op, +bool evaluate(const std::shared_ptr& op, ov::TensorVector& outputs, const 
ov::TensorVector& inputs) { using T = typename ov::element_type_traits::value_type; @@ -33,14 +33,14 @@ bool evaluate(const std::shared_ptr& op, } template <> -bool evaluate_node(std::shared_ptr node, +bool evaluate_node(std::shared_ptr node, ov::TensorVector& outputs, const ov::TensorVector& inputs) { const auto& element_type = node->get_output_element_type(0); #define CASE(type) \ case ov::element::type: \ - return evaluate(ov::as_type_ptr(node), outputs, inputs); + return evaluate(ov::as_type_ptr(node), outputs, inputs); switch (element_type) { CASE(bf16); diff --git a/src/plugins/template/backend/opset_int_tbl.hpp b/src/plugins/template/backend/opset_int_tbl.hpp index 4f1f14f2634867..6174e65f76444c 100644 --- a/src/plugins/template/backend/opset_int_tbl.hpp +++ b/src/plugins/template/backend/opset_int_tbl.hpp @@ -162,7 +162,8 @@ _OPENVINO_OP_REG(Multinomial, ov::op::v13) _OPENVINO_OP_REG(Inverse, ov::op::v14) _OPENVINO_OP_REG(AvgPool, ov::op::v14) _OPENVINO_OP_REG(MaxPool, ov::op::v14) -_OPENVINO_OP_REG(ROIAlignRotated, ov::op::v14) + +_OPENVINO_OP_REG(ROIAlignRotated, ov::op::v15) _OPENVINO_OP_REG(EmbeddingBagOffsets, op::v15) _OPENVINO_OP_REG(EmbeddingBagPacked, op::v15) @@ -170,4 +171,5 @@ _OPENVINO_OP_REG(Col2Im, ov::op::v15) _OPENVINO_OP_REG(AUGRUCell, ov::op::internal) _OPENVINO_OP_REG(AUGRUSequence, ov::op::internal) +_OPENVINO_OP_REG(RMS, ov::op::internal) _OPENVINO_OP_REG(RMSNorm, ov::op::internal) diff --git a/src/plugins/template/tests/functional/op_reference/base_reference_test.cpp b/src/plugins/template/tests/functional/op_reference/base_reference_test.cpp index 239405dbca6b19..f457ab09b90846 100644 --- a/src/plugins/template/tests/functional/op_reference/base_reference_test.cpp +++ b/src/plugins/template/tests/functional/op_reference/base_reference_test.cpp @@ -76,13 +76,7 @@ void CommonReferenceTest::Validate() { ASSERT_EQ(refOutData.size(), actualOutData.size()); for (size_t i = 0; i < refOutData.size(); i++) { - ValidateBlobs(refOutData[i], - actualOutData[i], - i, - threshold, - abs_threshold, - legacy_compare, - actual_comparision_size); + ValidateBlobs(refOutData[i], actualOutData[i], i, threshold, abs_threshold, legacy_compare); } } @@ -91,15 +85,12 @@ void CommonReferenceTest::ValidateBlobs(const ov::Tensor& refBlob, const size_t blob_idx, float threshold, float abs_threshold, - bool legacy_compare, - size_t actual_comparision_size) { + bool legacy_compare) { ASSERT_EQ(refBlob.get_element_type(), outBlob.get_element_type()) << "Incompatible element type for blob with index " << blob_idx; ASSERT_EQ(refBlob.get_byte_size(), outBlob.get_byte_size()) << "Incorrect byte size for blob with index " << blob_idx; - if (actual_comparision_size == 0) - actual_comparision_size = refBlob.get_size(); // compare() get fundamental element type with element_type_traits firstly and cast data to relative ov type with // 'from' types listed below have a fundamental analogue as int8_t, but int8_t is converted only to i8 with from std::vector raw_data_comp_only = @@ -133,6 +124,8 @@ void CommonReferenceTest::ValidateBlobs(const ov::Tensor& refBlob, } return; } + + const auto actual_comparision_size = refBlob.get_size(); switch (element_type) { case ov::element::bf16: ov::test::utils::compare_raw_data(refBlob.data(), diff --git a/src/plugins/template/tests/functional/op_reference/base_reference_test.hpp b/src/plugins/template/tests/functional/op_reference/base_reference_test.hpp index 94923ca58cb8ef..a228c908850b42 100644 --- 
a/src/plugins/template/tests/functional/op_reference/base_reference_test.hpp +++ b/src/plugins/template/tests/functional/op_reference/base_reference_test.hpp @@ -28,8 +28,7 @@ class CommonReferenceTest { const size_t blob_idx, float threshold, float abs_threshold, - bool legacy_compare, - size_t actual_comparision_size); + bool legacy_compare); protected: bool legacy_compare = false; @@ -42,9 +41,8 @@ class CommonReferenceTest { std::vector inputData; std::vector refOutData; std::vector actualOutData; - float threshold = 1e-2f; // Relative diff - float abs_threshold = -1.f; // Absolute diff (not used when negative) - size_t actual_comparision_size = 0; // For ref output data is smaller than output blob size + float threshold = 1e-2f; // Relative diff + float abs_threshold = -1.f; // Absolute diff (not used when negative) }; template diff --git a/src/plugins/template/tests/functional/op_reference/experimental_detectron_detection_prior_grid.cpp b/src/plugins/template/tests/functional/op_reference/experimental_detectron_detection_prior_grid.cpp index 5450c11a74a41e..58d577cb6a06aa 100644 --- a/src/plugins/template/tests/functional/op_reference/experimental_detectron_detection_prior_grid.cpp +++ b/src/plugins/template/tests/functional/op_reference/experimental_detectron_detection_prior_grid.cpp @@ -7,9 +7,10 @@ #include "base_reference_test.hpp" #include "openvino/op/experimental_detectron_prior_grid_generator.hpp" -using namespace reference_tests; using namespace ov; +using reference_tests::CommonReferenceTest; +using reference_tests::CreateTensor; using Attrs = op::v6::ExperimentalDetectronPriorGridGenerator::Attributes; namespace { @@ -30,7 +31,6 @@ struct ExperimentalPGGParams { imageSizeInfoShape(imageSizeInfoShape), outRefShape(outRefShape), inType(iType), - outType(iType), priorsData(CreateTensor(iType, priorsValues)), refData(CreateTensor(outRefShape, iType, refValues)), testcaseName(testcaseName) { @@ -54,12 +54,11 @@ struct ExperimentalPGGParams { PartialShape imageSizeInfoShape; Shape outRefShape; size_t actualComparisonSize; - ov::element::Type inType; - ov::element::Type outType; - ov::Tensor priorsData; - ov::Tensor featureMapData; - ov::Tensor imageSizeInfoData; - ov::Tensor refData; + element::Type inType; + Tensor priorsData; + Tensor featureMapData; + Tensor imageSizeInfoData; + Tensor refData; std::string testcaseName; }; @@ -67,23 +66,18 @@ class ReferenceExperimentalPGGLayerTest : public testing::TestWithParam 0) - actual_comparision_size = params.actualComparisonSize; } static std::string getTestCaseName(const testing::TestParamInfo& obj) { - auto param = obj.param; + const auto& param = obj.param; std::ostringstream result; result << "priorsShape=" << param.priorsShape << "_"; result << "featureMapShape=" << param.featureMapShape << "_"; result << "imageSizeInfoShape=" << param.imageSizeInfoShape << "_"; result << "iType=" << param.inType << "_"; - result << "oType=" << param.outType << "_"; result << "flatten=" << param.attrs.flatten << "_"; result << "h=" << param.attrs.h << "_"; result << "w=" << param.attrs.w << "_"; @@ -94,6 +88,26 @@ class ReferenceExperimentalPGGLayerTest : public testing::TestWithParam CreateFunction(const ExperimentalPGGParams& params) { const auto priors = std::make_shared(params.inType, params.priorsShape); @@ -103,7 +117,7 @@ class ReferenceExperimentalPGGLayerTest : public testing::TestWithParam(NodeVector{ExperimentalPGG}, ParameterVector{priors, featureMap, im_info}); + return std::make_shared(NodeVector{ExperimentalPGG}, 
ParameterVector{priors, featureMap, im_info}); } }; diff --git a/src/plugins/template/tests/functional/op_reference/memory.cpp b/src/plugins/template/tests/functional/op_reference/memory.cpp index 59ad59106eba66..ef16fae9f73e10 100644 --- a/src/plugins/template/tests/functional/op_reference/memory.cpp +++ b/src/plugins/template/tests/functional/op_reference/memory.cpp @@ -302,8 +302,7 @@ class ReferenceMemoryTest : public testing::TestWithParam { i, 1e-2f, -1.f, - true, - 0); + true); } } diff --git a/src/plugins/template/tests/functional/op_reference/rms_internal.cpp b/src/plugins/template/tests/functional/op_reference/rms_internal.cpp new file mode 100644 index 00000000000000..433d2e710d2a2d --- /dev/null +++ b/src/plugins/template/tests/functional/op_reference/rms_internal.cpp @@ -0,0 +1,437 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "base_reference_test.hpp" +#include "common_test_utils/common_utils.hpp" +#include "openvino/op/constant.hpp" +#include "ov_ops/rms.hpp" + +using namespace ov; +using namespace reference_tests; + +struct RMSParams { + RMSParams(const reference_tests::Tensor& paramInput, + const reference_tests::Tensor& paramReductionAxes, + const double eps, + const reference_tests::Tensor& paramExpected, + const reference_tests::Tensor& paramScale = {}) + : input(paramInput), + reductionAxes(paramReductionAxes), + eps(eps), + expected(paramExpected) { + if (paramScale.data) { + scale = paramScale; + } + } + reference_tests::Tensor input; + reference_tests::Tensor scale; + // Warning: Axes input is not currently supported by internal::RMS, it's always assumed to be "-1" + reference_tests::Tensor reductionAxes; + double eps; + reference_tests::Tensor expected; +}; + +class ReferenceRMSLayerTest : public testing::TestWithParam, public CommonReferenceTest { +public: + void SetUp() override { + auto params = GetParam(); + const auto output_type = + params.expected.type == params.input.type ? 
ov::element::undefined : params.expected.type; + function = CreateFunction(params.input, params.eps, params.scale, output_type); + if (!params.scale.data) { + inputData = {params.input.data}; + } else { + inputData = {params.input.data, params.scale.data}; + } + refOutData = {params.expected.data}; + if (params.input.type == ov::element::f32) { + threshold = 1e-5f; // Set more precise threshold to detect eps changes + } + } + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + auto param = obj.param; + std::ostringstream result; + result << "shape=" << param.input.shape; + result << "_iType=" << param.input.type; + result << "_oType=" << param.expected.type; + result << "_axesType=" << param.reductionAxes.type; + result << "_reductionAxes=" + << ov::test::utils::vec2str(op::v0::Constant(param.reductionAxes.data).cast_vector()); + if (param.scale.data) { + result << "_scaleShape=" << param.scale.shape; + } + result << "_eps=" << param.eps; + return result.str(); + } + +private: + static std::shared_ptr CreateFunction(const reference_tests::Tensor& input, + const double eps, + const reference_tests::Tensor& scale, + const ov::element::Type& output_type) { + const auto in = std::make_shared(input.type, input.shape); + + if (!scale.data) { + const auto scale_const = std::make_shared(input.type, input.shape, 1.0); + const auto rms_norm = std::make_shared(in, scale_const, eps, output_type); + return std::make_shared(NodeVector{rms_norm}, ParameterVector{in}); + } + const auto scale_param = std::make_shared(scale.type, scale.shape); + const auto rms_norm = std::make_shared(in, scale_param, eps, output_type); + return std::make_shared(NodeVector{rms_norm}, ParameterVector{in, scale_param}); + } +}; + +TEST_P(ReferenceRMSLayerTest, CompareWithHardcodedRefs) { + Exec(); +} + +INSTANTIATE_TEST_SUITE_P( + smoke_RMSInternal_With_Hardcoded_Refs, + ReferenceRMSLayerTest, + ::testing::Values( + RMSParams(reference_tests::Tensor{Shape{8}, + ov::element::f32, + std::vector({-6.44250308, + -59.65135475, + 28.08134504, + -3.38603289, + 1.047344, + -22.62146978, + 58.72749089, + 16.00083578})}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{Shape{8}, + ov::element::f32, + std::vector{-0.19629386, + -1.81749151, + 0.85559844, + -0.10316758, + 0.03191107, + -0.68924385, + 1.7893427, + 0.48752259}}), + RMSParams(reference_tests::Tensor{Shape{8}, + ov::element::f32, + std::vector({-6.44250308, + -59.65135475, + 28.08134504, + -3.38603289, + 1.047344, + -22.62146978, + 58.72749089, + 16.00083578})}, + reference_tests::Tensor{Shape{1}, ov::element::i32, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{Shape{8}, + ov::element::f32, + std::vector{-0.19629386, + -1.81749151, + 0.85559844, + -0.10316758, + 0.03191107, + -0.68924385, + 1.7893427, + 0.48752259}}), + RMSParams(reference_tests::Tensor{Shape{8}, + ov::element::f32, + std::vector({-6.44250308, + -59.65135475, + 28.08134504, + -3.38603289, + 1.047344, + -22.62146978, + 58.72749089, + 16.00083578})}, + reference_tests::Tensor{Shape{1}, ov::element::i32, std::vector({-1})}, + 1e-2, + reference_tests::Tensor{Shape{8}, + ov::element::f32, + std::vector{-0.19629295, + -1.81748319, + 0.85559446, + -0.10316710, + 0.03191093, + -0.68924063, + 1.78933442, + 0.48752034}}), + RMSParams(reference_tests::Tensor{Shape{8}, + ov::element::f32, + std::vector({-6.44250308, + -59.65135475, + 28.08134504, + -3.38603289, + 1.047344, + -22.62146978, + 58.72749089, + 16.00083578})}, + 
reference_tests::Tensor{Shape{1}, ov::element::i32, std::vector({-1})}, + 5.55, + reference_tests::Tensor{Shape{8}, + ov::element::f32, + std::vector{-0.19579013, + -1.81282747, + 0.85340279, + -0.10290283, + 0.03182918, + -0.68747509, + 1.78475082, + 0.48627150}}), + RMSParams( + reference_tests::Tensor{ + Shape{2, 3}, + ov::element::f32, + std::vector({-6.44250308, -59.65135475, 28.08134504, -3.38603289, 1.047344, -22.62146978})}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 3}, + ov::element::f32, + std::vector{-0.16844749, -1.559661, 0.7342227, -0.25613253, 0.07922512, -1.71117484}}), + + RMSParams(reference_tests::Tensor{Shape{2, 3, 1}, + ov::element::f32, + std::vector( + {-0.64425033, -5.9651356, 2.8081346, -0.3386033, 0.1047344, -2.262147})}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 3, 1}, + ov::element::f32, + std::vector{-0.99998795, -0.99999986, 0.99999937, -0.99995639, 0.99954449, -0.99999902}}), + + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::f32, + std::vector({-0.64425033, -5.9651356, 2.8081346, -0.3386033, 0.1047344, + -2.262147, 5.872749, 1.6000836, -6.754028, 4.015047, + 9.291021, 0.00016722, 7.7904015, -3.167727, 1.3428825, + -1.4490807, -1.2650547, 5.5311837, 0.71208346, 9.074844, + 0.8841632, -8.358102, -2.673152, 7.01701})}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 2, 2, 3}, + ov::element::f32, + std::vector{-0.16844743, -1.5596604, 0.7342224, -0.2561318, 0.0792249, -1.71117, + 1.1187618, 0.30481678, -1.2866459, 0.687082, 1.5899425, 0.00002862, + 1.5844078, -0.6442507, 0.27311474, -0.4285907, -0.3741618, 1.6359433, + 0.1348591, 1.7186543, 0.1674487, -1.288446, -0.41208065, 1.0817096}}), + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::bf16, + std::vector{ + -0.6445, -5.9688, 2.8125, -0.3379, 0.1045, -2.2656, 5.8750, 1.6016, + -6.7500, 4.0000, 9.3125, 0.0002, 7.7812, -3.1719, 1.3438, -1.4453, + -1.2656, 5.5312, 0.7109, 9.0625, 0.8828, -8.3750, -2.6719, 7.0312}}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 2, 2, 3}, + ov::element::bf16, + std::vector{-0.1680, -1.5625, 0.7344, -0.2559, 0.0791, -1.7188, 1.1172, 0.3047, + -1.2891, 0.6836, 1.5938, 0.0000, 1.5859, -0.6484, 0.2734, -0.4277, + -0.3750, 1.6406, 0.1348, 1.7188, 0.1670, -1.2891, -0.4102, 1.0781}}), + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::bf16, + std::vector{ + -0.6445, -5.9688, 2.8125, -0.3379, 0.1045, -2.2656, 5.8750, 1.6016, + -6.7500, 4.0000, 9.3125, 0.0002, 7.7812, -3.1719, 1.3438, -1.4453, + -1.2656, 5.5312, 0.7109, 9.0625, 0.8828, -8.3750, -2.6719, 7.0312}}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 2, 2, 3}, + ov::element::bf16, + std::vector{-0.0840, -0.7812, 0.3672, -0.1279, 0.0396, -0.8594, 0.5586, 0.1523, + -0.6445, 0.3418, 0.7969, 0.0000, 0.7930, -0.3242, 0.1367, -0.2139, + -0.1875, 0.8203, 0.0674, 0.8594, 0.0835, -0.6445, -0.2051, 0.5391}}, + reference_tests::Tensor{Shape{1}, ov::element::bf16, std::vector{0.5}}), + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::f16, + std::vector{-0.644, -5.965, 2.809, -0.3386, 0.10474, -2.262, + 5.87, 1.6, -6.754, 4.016, 9.29, 0.0001673, + 7.79, -3.168, 1.343, -1.449, -1.265, 5.53, + 0.712, 
9.08, 0.8843, -8.36, -2.674, 7.016}}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 2, 2, 3}, + ov::element::f16, + std::vector{-0.1683, -1.559, 0.734, -0.256, 0.0792, -1.711, 1.118, 0.3047, + -1.286, 0.687, 1.59, 0.0000286, 1.584, -0.644, 0.273, -0.4287, + -0.374, 1.636, 0.1348, 1.719, 0.1675, -1.288, -0.412, 1.081}}), + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::f16, + std::vector{-0.644, -5.965, 2.809, -0.3386, 0.10474, -2.262, + 5.87, 1.6, -6.754, 4.016, 9.29, 0.0001673, + 7.79, -3.168, 1.343, -1.449, -1.265, 5.53, + 0.712, 9.08, 0.8843, -8.36, -2.674, 7.016}}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::f16, + std::vector{ + + -0.08417, -0.7793, 0.367, -0.128, 0.0396, -0.8555, 0.559, 0.1523, + -0.643, 0.3435, 0.795, 0.0000143, 0.792, -0.322, 0.1365, -0.2144, + -0.187, 0.818, 0.0674, 0.8594, 0.08374, -0.644, -0.206, 0.5405}}, + reference_tests::Tensor{Shape{1}, ov::element::f16, std::vector{0.5}}), + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::f64, + std::vector{-0.64425031, -5.96513547, 2.8081345, -0.33860329, + 0.1047344, -2.26214698, 5.87274909, 1.60008358, + -6.75402803, 4.01504693, 9.2910216, 0.00016722, + 7.79040128, -3.16772695, 1.34288255, -1.44908073, + -1.26505474, 5.5311837, 0.71208347, 9.07484454, + 0.8841632, -8.35810155, -2.67315197, 7.01701008}}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 2, 2, 3}, + ov::element::f64, + std::vector{-0.16844743, -1.55966048, 0.73422245, -0.2561318, 0.0792249, -1.71116999, + 1.1187618, 0.30481677, -1.2866459, 0.68708204, 1.58994258, 0.00002862, + 1.58440782, -0.64425068, 0.27311477, -0.42859069, -0.37416182, 1.63594325, + 0.1348591, 1.71865438, 0.1674487, -1.28844602, -0.41208066, 1.0817096}}), + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::f64, + std::vector{-0.64425031, -5.96513547, 2.8081345, -0.33860329, + 0.1047344, -2.26214698, 5.87274909, 1.60008358, + -6.75402803, 4.01504693, 9.2910216, 0.00016722, + 7.79040128, -3.16772695, 1.34288255, -1.44908073, + -1.26505474, 5.5311837, 0.71208347, 9.07484454, + 0.8841632, -8.35810155, -2.67315197, 7.01701008}}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 2, 2, 3}, + ov::element::f64, + std::vector{-0.08422372, -0.77983024, 0.36711123, -0.1280659, 0.03961245, -0.855585, + 0.5593809, 0.15240838, -0.64332295, 0.34354102, 0.79497129, 0.00001431, + 0.79220391, -0.32212534, 0.13655738, -0.21429535, -0.18708091, 0.81797163, + 0.06742955, 0.85932719, 0.08372435, -0.64422301, -0.20604033, 0.5408548}}, + reference_tests::Tensor{Shape{1}, ov::element::f64, std::vector{0.5}}), + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::f32, + std::vector({-0.64425033, -5.9651356, 2.8081346, -0.3386033, 0.1047344, + -2.262147, 5.872749, 1.6000836, -6.754028, 4.015047, + 9.291021, 0.00016722, 7.7904015, -3.167727, 1.3428825, + -1.4490807, -1.2650547, 5.5311837, 0.71208346, 9.074844, + 0.8841632, -8.358102, -2.673152, 7.01701})}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 2, 2, 3}, + ov::element::f32, + std::vector{-0.08422372, -0.77983022, 0.36711121, -0.1280659, 0.03961245, -0.85558498, + 0.55938089, 0.15240839, -0.64332294, 
0.343541, 0.79497123, 0.00001431, + 0.7922039, -0.32212535, 0.13655737, -0.21429534, -0.1870809, 0.81797165, + 0.06742955, 0.85932714, 0.08372435, -0.64422297, -0.20604032, 0.54085481}}, + reference_tests::Tensor{Shape{1}, ov::element::f32, std::vector{0.5}}), + + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::f32, + std::vector({-0.64425033, -5.9651356, 2.8081346, -0.3386033, 0.1047344, + -2.262147, 5.872749, 1.6000836, -6.754028, 4.015047, + 9.291021, 0.00016722, 7.7904015, -3.167727, 1.3428825, + -1.4490807, -1.2650547, 5.5311837, 0.71208346, 9.074844, + 0.8841632, -8.358102, -2.673152, 7.01701})}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 2, 2, 3}, + ov::element::f32, + std::vector{-0.08422372, -2.33949065, 0.1835556, -0.1280659, 0.11883735, -0.42779249, + 0.55938089, 0.45722517, -0.32166147, 0.343541, 2.38491368, 0.00000715, + 0.7922039, -0.96637604, 0.06827869, -0.21429534, -0.56124271, 0.40898582, + 0.06742955, 2.57798141, 0.04186217, -0.64422297, -0.61812097, 0.27042741}}, + reference_tests::Tensor{Shape{1, 3}, ov::element::f32, std::vector{0.5, 1.5, 0.25}}), + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::f32, + std::vector({-0.64425033, -5.9651356, 2.8081346, -0.3386033, 0.1047344, + -2.262147, 5.872749, 1.6000836, -6.754028, 4.015047, + 9.291021, 0.00016722, 7.7904015, -3.167727, 1.3428825, + -1.4490807, -1.2650547, 5.5311837, 0.71208346, 9.074844, + 0.8841632, -8.358102, -2.673152, 7.01701})}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 2, 2, 3}, + ov::element::f16, + std::vector{-0.08422372, -2.33949065, 0.1835556, -0.1280659, 0.11883735, + -0.42779249, 0.55938089, 0.45722517, -0.32166147, 0.343541, + 2.38491368, 0.00000715, 0.7922039, -0.96637604, 0.06827869, + -0.21429534, -0.56124271, 0.40898582, 0.06742955, 2.57798141, + 0.04186217, -0.64422297, -0.61812097, 0.27042741}}, + reference_tests::Tensor{Shape{1, 3}, ov::element::f32, std::vector{0.5, 1.5, 0.25}}), + RMSParams(reference_tests::Tensor{Shape{1, 3, 3, 3}, + ov::element::f32, + std::vector({1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9})}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{1, 3, 3, 3}, + ov::element::f32, + std::vector{0.46290955, 0.92581911, 1.38872866, 0.78954188, 0.98692735, 1.18431282, + 0.87047794, 0.99483193, 1.11918592, 0.46290955, 0.92581911, 1.38872866, + 0.78954188, 0.98692735, 1.18431282, 0.87047794, 0.99483193, 1.11918592, + 0.46290955, 0.92581911, 1.38872866, 0.78954188, 0.98692735, 1.18431282, + 0.87047794, 0.99483193, 1.11918592}}), + RMSParams(reference_tests::Tensor{Shape{2, 3, 4}, + ov::element::f16, + std::vector({-64.44, -596.5, 280.8, -33.88, 10.48, -226.2, + 587.5, 160., -675.5, 401.5, 929., 0.01672, + 779., -316.8, 134.2, -144.9, -126.5, 553., + 71.2, 907.5, 88.44, -836., -267.2, 701.5})}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-1, + reference_tests::Tensor{ + Shape{2, 3, 4}, + ov::element::f16, + // Expected overflow due to f16 accumulation + // (that's why the conversion to fp32 is needed, tested below) + std::vector{-0., -0., 0., -0., 0., -0., 0., 0., -0., 0., 0., 0., + 0., -0., 0., -0., -0., 0., 0., 0., 0., -0., -0., 0.}}, + reference_tests::Tensor{Shape{1}, ov::element::f16, std::vector{1.0}}), + RMSParams(reference_tests::Tensor{Shape{2, 3, 
4}, + ov::element::f32, + std::vector{-64.44, -596.5, 280.8, -33.88, 10.48, -226.2, + 587.5, 160., -675.5, 401.5, 929., 0.01672, + 779., -316.8, 134.2, -144.9, -126.5, 553., + 71.2, 907.5, 88.44, -836., -267.2, 701.5}}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-1, + reference_tests::Tensor{ + Shape{2, 3, 4}, + ov::element::f16, + std::vector{-0.19433594, -1.79882812, 0.84667969, -0.10217285, 0.03225708, + -0.69628906, 1.80859375, 0.49267578, -1.11035156, 0.66015625, + 1.52734375, 0.00002748, 1.80371094, -0.73339844, 0.31079102, + -0.33544922, -0.23583984, 1.03125000, 0.13269043, 1.69238281, + 0.15698242, -1.48339844, -0.47436523, 1.24511719}}, + reference_tests::Tensor{Shape{1}, ov::element::f32, std::vector{1.0}}), + RMSParams(reference_tests::Tensor{Shape{2, 3, 4}, + ov::element::f32, + std::vector{ + -64.4375000000, -596.5000000000, 280.7500000000, -33.8750000000, + 10.4765625000, -226.2500000000, 587.5000000000, 160.0000000000, + -675.5000000000, 401.5000000000, 929.0000000000, 0.0167236328, + 779.0000000000, -316.7500000000, 134.2500000000, -144.8750000000, + -126.5000000000, 553.0000000000, 71.1875000000, 907.5000000000, + 88.4375000000, -836.0000000000, -267.2500000000, 701.5000000000}}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-1, + reference_tests::Tensor{ + Shape{2, 3, 4}, + ov::element::f16, + std::vector{-0.0971679688, -2.6972656250, 0.2116699219, -0.2043457031, 0.0161285400, + -1.0449218750, 0.4521484375, 0.9853515625, -0.5551757812, 0.9897460938, + 0.3818359375, 0.0000549555, 0.9018554688, -1.0996093750, 0.0776977539, + -0.6708984375, -0.1179199219, 1.5468750000, 0.0331726074, 3.3847656250, + 0.0784912109, -2.2246093750, -0.1185913086, 2.4902343750}}, + reference_tests::Tensor{Shape{4}, ov::element::f32, std::vector{0.5, 1.5, 0.25, 2.0}})), + ReferenceRMSLayerTest::getTestCaseName); diff --git a/src/plugins/template/tests/functional/op_reference/roi_align_rotated.cpp b/src/plugins/template/tests/functional/op_reference/roi_align_rotated.cpp index 12ba487829b987..40fecda965ca2b 100644 --- a/src/plugins/template/tests/functional/op_reference/roi_align_rotated.cpp +++ b/src/plugins/template/tests/functional/op_reference/roi_align_rotated.cpp @@ -102,7 +102,7 @@ class ReferenceROIAlignRotatedTest : public testing::TestWithParam(params.roiBatchIdxs.type, params.roiBatchIdxs.shape, params.roiBatchIdxs.data.data()); - const auto roi_align_rot = std::make_shared(featureMap, + const auto roi_align_rot = std::make_shared(featureMap, coords, roisIdx, params.pooledH, diff --git a/src/tests/functional/plugin/conformance/subgraphs_dumper/README.md b/src/tests/functional/plugin/conformance/subgraphs_dumper/README.md index 3f6da9065419b5..b382f14a25c54e 100644 --- a/src/tests/functional/plugin/conformance/subgraphs_dumper/README.md +++ b/src/tests/functional/plugin/conformance/subgraphs_dumper/README.md @@ -1,7 +1,7 @@ # Subgraphs Dumper Tool The tool is designed to analyse any arbitrary scope of the models in a formats supported by OpenVINO frontends -to extract and serialize unique operations and patterns from the input models. The criteria for +to extract and serialize unique operations and patterns from the input models. The criteria for uniqueness and matching are defined by implementation of twon interface classes: * `Matcher` defines the rules for dumping operatons to the cache. * `Extractor` defines the rules for extracting subgraphs from the models. 
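Note on the RMS reference vectors added in rms.cpp above: the expected outputs appear to follow plain RMS normalization over the given reduction axis, y = x / sqrt(mean(x^2) + eps), optionally multiplied by the scale tensor supplied as the last parameter. A minimal NumPy sketch (illustrative only, not part of this PR) that reproduces, for example, the Shape{2, 3} f32 case with eps = 1e-5:

```
# Sketch of the RMS normalization used to derive the expected test vectors above
# (hypothetical helper, not code from the PR).
import numpy as np

def rms_norm(x, axis=-1, eps=1e-5, scale=1.0):
    # y = x / sqrt(mean(x^2) + eps) * scale, reduced over `axis`
    rms = np.sqrt(np.mean(np.square(x), axis=axis, keepdims=True) + eps)
    return x / rms * scale

x = np.array([[-6.44250308, -59.65135475, 28.08134504],
              [-3.38603289, 1.047344, -22.62146978]], dtype=np.float32)
print(rms_norm(x))
# approx. [[-0.16844749, -1.559661,   0.7342227 ],
#          [-0.25613253,  0.07922512, -1.71117484]]
```

With scale = 0.5 the same inputs give the halved expected vectors used in the scaled test cases.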
@@ -16,11 +16,11 @@ uniqueness and matching are defined by implementation of twon interface classes: * `read value & assign` extracts stateful graphs. > NOTE: -> Please check the following architecture [diagram](./../../../../../../docs/sphinx_setup/_static/images/img/subgraphs_dumper_arch_diaram.png) to get detailed information. +> Please check the following architecture [diagram](../../../../../../docs/articles_en/assets/images/subgraphs_dumper_arch_diaram.png) to get detailed information. ## Build -To build the tool, run the following commands: +To build the tool, run the following commands: ``` cmake -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_TESTS=ON . make --jobs=$(nproc --all) ov_subgraphs_dumper @@ -28,7 +28,7 @@ make --jobs=$(nproc --all) ov_subgraphs_dumper The outcome of a build is a `ov_subgraphs_dumper` binary located in the building artifacts folder. ## Run -The tool takes only one required command-line parameter: +The tool takes only one required command-line parameter: * `--input_folders` - Required. Comma separated paths to the input folders with models in Intermediate Representation format (IRs). The separator is `,`. * `--output_folder` - Optinal. Path to the output folders where the IRs will be serialized. Default value is "output". * `--local_cache` - Optional. Comma-separated paths to the local cache folders with IRs. The separator is `,`. @@ -36,7 +36,7 @@ The tool takes only one required command-line parameter: * `--extract_body` - Optional. Allows extracting operation bodies to the operation cache. * `--cache_type` - Optional. Allows extracting Operations, Subgraphs, or both types. The default value is `OP` and `GRAPH`. -Example running command: +Example running command: ```ov_subgraphs_dumper --input_folders /dir_0/to/models,/dir_1/to/models --output_folder /path/to/dir``` ## Extraction Algorithm @@ -55,4 +55,4 @@ make ov_subgraphs_dumper_tests ``` ## Architecture Diagram -![SubgraphsDumper Architecture Diagram](./../../../../../../docs/sphinx_setup/_static/images/img/subgraphs_dumper_arch_diaram.png) +![SubgraphsDumper Architecture Diagram](../../../../../../docs/articles_en/assets/images/subgraphs_dumper_arch_diaram.png) diff --git a/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/single_op_graph.cpp b/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/single_op_graph.cpp index f9a4a1f70e017c..98c3d234914be8 100644 --- a/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/single_op_graph.cpp +++ b/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/single_op_graph.cpp @@ -961,7 +961,7 @@ std::shared_ptr generate(const std::shared_ptr& return std::make_shared(results, params, "ROIAlignGraph"); } -std::shared_ptr generate(const std::shared_ptr& node) { +std::shared_ptr generate(const std::shared_ptr& node) { ov::ParameterVector params{std::make_shared(ov::element::f32, ov::Shape{{1, 1, 16, 16}})}; const auto coords = std::make_shared( ov::element::f32, @@ -969,7 +969,7 @@ std::shared_ptr generate(const std::shared_ptr(node->get_rois_input_second_dim_size(), 0)); const auto roisIdx = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{0}); - auto new_node = std::make_shared(params.at(0), coords, roisIdx, 2, 2, 2, 1, true); + auto new_node = std::make_shared(params.at(0), coords, roisIdx, 2, 2, 2, 1, true); ov::ResultVector results{std::make_shared(new_node)}; return std::make_shared(results, params, 
"ROIAlignRotatedGraph"); } diff --git a/src/tests/functional/shared_test_classes/src/subgraph/weights_decompression_builders.cpp b/src/tests/functional/shared_test_classes/src/subgraph/weights_decompression_builders.cpp index 6f86e420b1a71e..38e45065c43b47 100644 --- a/src/tests/functional/shared_test_classes/src/subgraph/weights_decompression_builders.cpp +++ b/src/tests/functional/shared_test_classes/src/subgraph/weights_decompression_builders.cpp @@ -200,9 +200,11 @@ std::shared_ptr initGatherDecompressionSubgraph(const ov::Shape& data_ original_data_shape[data_idx] = data_shape[1] / group_size; original_data_shape.insert(original_data_shape.begin() + data_idx + 1, group_size); } - ov::test::utils::InputGenerateData generate_data; + + const auto up_to = data_precision == ov::element::i4 ? 7 : 15; + ov::test::utils::InputGenerateData generate_data(0, up_to); if (data_precision.is_signed()) - generate_data.start_from = -5; + generate_data.start_from = -1; auto weights_tensor = ov::test::utils::create_and_fill_tensor(data_precision, original_data_shape, generate_data); auto weights = std::make_shared(weights_tensor); weights->set_friendly_name("Compressed_weights"); @@ -226,7 +228,7 @@ std::shared_ptr initGatherDecompressionSubgraph(const ov::Shape& data_ scaleshift_const_shape.end()); if (add_subtract) { auto shift_tensor_shape = per_tensor_zp ? ov::Shape{1} : scaleshift_const_shape; - auto shift_tensor = ov::test::utils::create_and_fill_tensor(data_precision, shift_tensor_shape); + auto shift_tensor = ov::test::utils::create_and_fill_tensor(data_precision, shift_tensor_shape, ov::test::utils::InputGenerateData(0, up_to)); if (per_tensor_zp && data_precision.bitwidth() == 4) { static_cast(shift_tensor.data())[0] = 0x88; } diff --git a/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py b/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py index 6662bb78ca0cdd..e6291ef566eaf8 100644 --- a/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py +++ b/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py @@ -98,7 +98,7 @@ def numpy_to_torch_recursively(x): ov_inputs = flattenize_inputs(inputs) if self.use_torch_compile_backend(): - self.torch_compile_backend_test(model, torch_inputs, custom_eps) + self.torch_compile_backend_test(model, torch_inputs, custom_eps, **kwargs) else: if self.use_torch_export(): from openvino import convert_model @@ -262,7 +262,7 @@ def _resolve_input_shape_dtype(self, om, ov_inputs, dynamic_shapes): om.validate_nodes_and_infer_types() return om - def torch_compile_backend_test(self, model, inputs, custom_eps): + def torch_compile_backend_test(self, model, inputs, custom_eps, **kwargs): torch._dynamo.reset() with torch.no_grad(): model.eval() @@ -271,8 +271,15 @@ def torch_compile_backend_test(self, model, inputs, custom_eps): torch._dynamo.reset() with torch.no_grad(): model.eval() + options={"testing": 1,} + if ("aot_autograd" in kwargs): + options.update({"aot_autograd": True,}) + dynamic = False + if ("dynamic" in kwargs): + dynamic = kwargs["dynamic"] + ov_model = torch.compile( - model, backend="openvino", options={"testing": 1}) + model, backend="openvino", dynamic=dynamic, options=options) ov_res = ov_model(*inputs) if not isinstance(fw_res, (tuple)): diff --git a/tests/layer_tests/pytorch_tests/test_expand.py b/tests/layer_tests/pytorch_tests/test_expand.py index 659fa70d17a5f7..4ef275ae0d4bfd 100644 --- a/tests/layer_tests/pytorch_tests/test_expand.py +++ b/tests/layer_tests/pytorch_tests/test_expand.py @@ -2,6 +2,7 @@ # 
SPDX-License-Identifier: Apache-2.0 import pytest +import random from pytorch_layer_test_class import PytorchLayerTest @@ -36,6 +37,7 @@ def forward_broadcast(self, x): @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export + @pytest.mark.precommit_fx_backend def test_expand(self, dims, op_type, ie_device, precision, ir_version): self._test(*self.create_model(dims, op_type), ie_device, precision, ir_version) @@ -70,6 +72,7 @@ def forward_broadcast(self, x, y): @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export + @pytest.mark.precommit_fx_backend def test_expand(self, dims, op_type, ie_device, precision, ir_version): self._test(*self.create_model(op_type), ie_device, precision, ir_version, kwargs_to_prepare_input={"broadcast_shape": dims}) @@ -110,3 +113,29 @@ def forward(self, x, y): def test_expand(self, ie_device, precision, ir_version, kwargs_to_prepare_input): self._test(*self.create_model(), ie_device, precision, ir_version, kwargs_to_prepare_input=kwargs_to_prepare_input) + +class TestDynamicExpand(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + last_dym = random.randint(1,4) + return (np.random.randn(1, 3, 1).astype(np.float32), last_dym) + + def create_model(self, dim): + import torch + + class aten_expand(torch.nn.Module): + def __init__(self, dims): + super(aten_expand, self).__init__() + self.dims = dims + + def forward(self, x, dym): + return x.expand((self.dims+(dym,))) + + ref_net = None + + return aten_expand(dim), ref_net, f"aten::expand" + + @pytest.mark.parametrize("dims", [(4, 3), (-1, -1)]) + @pytest.mark.precommit_fx_backend + def test_dynamic_expand(self, dims, ie_device, precision, ir_version): + self._test(*self.create_model(dims), ie_device, precision, ir_version, dynamic=True, aot_autograd=True) diff --git a/tests/layer_tests/pytorch_tests/test_reshape.py b/tests/layer_tests/pytorch_tests/test_reshape.py index 8cddf05aab0211..0498d410600b27 100644 --- a/tests/layer_tests/pytorch_tests/test_reshape.py +++ b/tests/layer_tests/pytorch_tests/test_reshape.py @@ -3,13 +3,14 @@ import numpy as np import pytest +import random from pytorch_layer_test_class import PytorchLayerTest class TestReshape(PytorchLayerTest): def _prepare_input(self): - return (np.random.uniform(0, 50, (1, 12, 12, 24)).astype(np.float32),) + return (np.random.uniform(0, 50, (1, 12, 12, 24)).astype(np.float32)) def create_model(self, shape): import torch @@ -39,5 +40,37 @@ def forward(self, x): @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export + @pytest.mark.precommit_fx_backend def test_reshape(self, shape, ie_device, precision, ir_version): self._test(*self.create_model(shape), ie_device, precision, ir_version) + +class TestDynamicReshape(PytorchLayerTest): + def _prepare_input(self): + last_dym = random.randint(1,2) + return (np.random.uniform(0, 50, (1, 12, 12, 24)).astype(np.float32), last_dym) + + def create_model(self, shape): + import torch + + class aten_reshape(torch.nn.Module): + def __init__(self, shape): + super(aten_reshape, self).__init__() + self.shape = shape + + def forward(self, x, dym): + #return torch.reshape(x, self.shape) + dym2 = int(torch.ops.aten.sym_size(x, 3)/dym) + return torch.reshape(x, [12, 12, dym2, dym]) + + ref_net = None + + return aten_reshape(shape), ref_net, "aten::reshape" + + @pytest.mark.parametrize(("shape"), [ + [12, 12, 24, 1], + [12, 12, 12, 2], + [24, 12, 12, 1], + ]) + @pytest.mark.precommit_fx_backend + def test_dynamic_reshape(self, shape, 
ie_device, precision, ir_version): + self._test(*self.create_model(shape), ie_device, precision, ir_version, aot_autograd=True, dynamic=True)
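For the new precommit_fx_backend tests above, torch_compile_backend_test now forwards the extra kwargs to torch.compile. A condensed sketch of that call path (the helper name and the model/input arguments are placeholders, not code from the PR):

```
# How the new `dynamic`/`aot_autograd` kwargs reach torch.compile
# (illustrative sketch under the assumption that openvino.torch is installed).
import torch
import openvino.torch  # noqa: F401  registers the "openvino" backend for torch.compile

def compile_with_openvino(model, inputs, dynamic=False, aot_autograd=False):
    # Mirrors the options handling added to torch_compile_backend_test:
    # "testing": 1 is always set; "aot_autograd" is only added when requested.
    options = {"testing": 1}
    if aot_autograd:
        options["aot_autograd"] = True
    compiled = torch.compile(model, backend="openvino", dynamic=dynamic, options=options)
    with torch.no_grad():
        return compiled(*inputs)
```

test_dynamic_expand and test_dynamic_reshape pass dynamic=True and aot_autograd=True, so the randomly generated dimension is treated as dynamic rather than baked into the traced graph.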