diff --git a/.github/dockerfiles/docker_tag b/.github/dockerfiles/docker_tag
index 094e08dade5967..7a82a65ff487c6 100644
--- a/.github/dockerfiles/docker_tag
+++ b/.github/dockerfiles/docker_tag
@@ -1 +1 @@
-pr-24598
+pr-24573
\ No newline at end of file
diff --git a/.github/dockerfiles/ov_build/ubuntu_20_04_x64/Dockerfile b/.github/dockerfiles/ov_build/ubuntu_20_04_x64/Dockerfile
new file mode 100644
index 00000000000000..b13bfe1f2df316
--- /dev/null
+++ b/.github/dockerfiles/ov_build/ubuntu_20_04_x64/Dockerfile
@@ -0,0 +1,72 @@
+FROM openvinogithubactions.azurecr.io/dockerhub/ubuntu:20.04
+
+USER root
+
+# APT configuration
+RUN echo 'Acquire::Retries "10";' > /etc/apt/apt.conf && \
+    echo 'APT::Get::Assume-Yes "true";' >> /etc/apt/apt.conf && \
+    echo 'APT::Get::Fix-Broken "true";' >> /etc/apt/apt.conf && \
+    echo 'APT::Get::no-install-recommends "true";' >> /etc/apt/apt.conf
+
+ENV DEBIAN_FRONTEND="noninteractive" \
+    TZ="Europe/London"
+
+RUN apt-get update && \
+    apt-get install software-properties-common && \
+    add-apt-repository --yes --no-update ppa:git-core/ppa && \
+    add-apt-repository --yes --no-update ppa:deadsnakes/ppa && \
+    apt-get update && \
+    apt-get install \
+        curl \
+        git \
+        ca-certificates \
+        gpg-agent \
+        tzdata \
+        # Pythons
+        python3.8-dev \
+        python3.8-venv \
+        python3.8-distutils \
+        python3.11-dev \
+        python3.11-venv \
+        python3.11-distutils \
+        # For Java API
+        default-jdk \
+        # Compiler \
+        gcc-10 \
+        g++-10 \
+        && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install build dependencies
+ADD install_build_dependencies.sh /install_build_dependencies.sh
+RUN chmod +x /install_build_dependencies.sh && \
+    /install_build_dependencies.sh && \
+    rm -rf /var/lib/apt/lists/*
+
+# Set gcc-10 as a default compiler
+RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 30 && \
+    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 30
+
+# Install sccache
+ARG SCCACHE_VERSION="v0.7.5"
+ENV SCCACHE_HOME="/opt/sccache" \
+    SCCACHE_PATH="/opt/sccache/sccache"
+
+RUN mkdir ${SCCACHE_HOME} && cd ${SCCACHE_HOME} && \
+    SCCACHE_ARCHIVE="sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl.tar.gz" && \
+    curl -SLO https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/${SCCACHE_ARCHIVE} && \
+    tar -xzf ${SCCACHE_ARCHIVE} --strip-components=1 && rm ${SCCACHE_ARCHIVE}
+
+# Setup pip
+ENV PIP_VERSION="24.0"
+RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
+    python3.8 get-pip.py --no-cache-dir pip==${PIP_VERSION} && \
+    python3.11 get-pip.py --no-cache-dir pip==${PIP_VERSION} && \
+    rm -f get-pip.py
+
+# Use Python 3.11 as default instead of Python 3.8
+# Using venv here because other methods to switch the default Python on Ubuntu 20 break both system and wheels build
+RUN python3.11 -m venv venv
+ENV PATH="/venv/bin:$SCCACHE_HOME:$PATH"
+
+ENV PIP_CACHE_DIR=/mount/caches/pip/linux/${PIP_VERSION}
diff --git a/.github/dockerfiles/ov_build/ubuntu_22_04_x64/Dockerfile b/.github/dockerfiles/ov_build/ubuntu_22_04_x64/Dockerfile
new file mode 100644
index 00000000000000..cb3e4cc639e0a9
--- /dev/null
+++ b/.github/dockerfiles/ov_build/ubuntu_22_04_x64/Dockerfile
@@ -0,0 +1,73 @@
+FROM openvinogithubactions.azurecr.io/dockerhub/ubuntu:22.04
+
+USER root
+
+# APT configuration
+RUN echo 'Acquire::Retries "10";' > /etc/apt/apt.conf && \
+    echo 'APT::Get::Assume-Yes "true";' >> /etc/apt/apt.conf && \
+    echo 'APT::Get::Fix-Broken "true";' >> /etc/apt/apt.conf && \
+    echo 'APT::Get::no-install-recommends "true";' >> /etc/apt/apt.conf
+
+ENV DEBIAN_FRONTEND="noninteractive" \
+    TZ="Europe/London"
+
+RUN apt-get update && \
+    apt-get install software-properties-common && \
+    add-apt-repository --yes --no-update ppa:git-core/ppa && \
+    add-apt-repository --yes --no-update ppa:deadsnakes/ppa && \
+    apt-get update && \
+    apt-get install \
+        curl \
+        git \
+        ca-certificates \
+        gpg-agent \
+        tzdata \
+        libtbb2 \
+        # Pythons
+        python3.8-dev \
+        python3.8-venv \
+        python3.8-distutils \
+        python3.11-dev \
+        python3.11-venv \
+        python3.11-distutils \
+        # For Java API
+        default-jdk \
+        # Compiler \
+        gcc-10 \
+        g++-10 \
+        && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install build dependencies
+ADD install_build_dependencies.sh /install_build_dependencies.sh
+RUN chmod +x /install_build_dependencies.sh && \
+    /install_build_dependencies.sh && \
+    rm -rf /var/lib/apt/lists/*
+
+# Set gcc-10 as a default compiler
+RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 30 && \
+    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 30
+
+# Install sccache
+ARG SCCACHE_VERSION="v0.7.5"
+ENV SCCACHE_HOME="/opt/sccache" \
+    SCCACHE_PATH="/opt/sccache/sccache"
+
+RUN mkdir ${SCCACHE_HOME} && cd ${SCCACHE_HOME} && \
+    SCCACHE_ARCHIVE="sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl.tar.gz" && \
+    curl -SLO https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/${SCCACHE_ARCHIVE} && \
+    tar -xzf ${SCCACHE_ARCHIVE} --strip-components=1 && rm ${SCCACHE_ARCHIVE}
+
+# Setup pip
+ENV PIP_VERSION="24.0"
+RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
+    python3.8 get-pip.py --no-cache-dir pip==${PIP_VERSION} && \
+    python3.11 get-pip.py --no-cache-dir pip==${PIP_VERSION} && \
+    rm -f get-pip.py
+
+# Use Python 3.11 as the default instead of the system Python
+# Using venv here because other methods to switch the default Python on Ubuntu 22 break both system and wheels build
+RUN python3.11 -m venv venv
+ENV PATH="/venv/bin:$SCCACHE_HOME:$PATH"
+
+ENV PIP_CACHE_DIR=/mount/caches/pip/linux/${PIP_VERSION}
diff --git a/.github/workflows/job_onnx_runtime.yml b/.github/workflows/job_onnx_runtime.yml
index 31420d793b14cd..b1d7060b6bce33 100644
--- a/.github/workflows/job_onnx_runtime.yml
+++ b/.github/workflows/job_onnx_runtime.yml
@@ -40,10 +40,6 @@ jobs:
       ONNX_RUNTIME_UTILS: ${{ github.workspace }}/install/onnxruntime
       ONNX_RUNTIME_BUILD_DIR: ${{ github.workspace }}/onnxruntime/build
     steps:
-      - name: Set apt retries
-        if: runner.os == 'Linux'
-        run: echo 'Acquire::Retries "10";' > /etc/apt/apt.conf.d/80-retries
-
       - name: Download OpenVINO package
         uses: actions/download-artifact@v4
         with:
@@ -59,38 +55,12 @@ jobs:
           echo "ONNX_RUNTIME_UTILS=$GITHUB_WORKSPACE/install/onnxruntime" >> "$GITHUB_ENV"
           echo "ONNX_RUNTIME_BUILD_DIR=$GITHUB_WORKSPACE/onnxruntime/build" >> "$GITHUB_ENV"

-      - name: Fetch install_build_dependencies.sh and setup_python action
-        uses: actions/checkout@v4
-        with:
-          sparse-checkout: |
-            install_build_dependencies.sh
-            .github/actions/setup_python/action.yml
-          sparse-checkout-cone-mode: false
-          path: 'openvino'
-
-      - name: Install git
-        run: |
-          apt-get update
-          apt-get install --assume-yes --no-install-recommends git ca-certificates
-
-      - name: Setup Python ${{ env.PYTHON_VERSION }}
-        uses: ./openvino/.github/actions/setup_python
-        with:
-          version: '3.11'
-          should-setup-pip-paths: 'false'
-
       - name: Extract OpenVINO package
         run: |
           pushd ${INSTALL_DIR}
             tar -xzf openvino_package.tar.gz -C ${INSTALL_DIR}
           popd

-      - name: Install OpenVINO dependencies
-        run: |
-
${INSTALL_DIR}/install_dependencies/install_openvino_dependencies.sh -c=core -c=dev -y - # since we are on Ubuntu 22.04, but compiled OpenVINO on Ubuntu 20.04, we need to install `libtbb2` - apt-get install --assume-yes --no-install-recommends libtbb2 - - name: Clone ONNX Runtime run: | hash=`tr -s '\n ' < ${ONNX_RUNTIME_UTILS}/version` @@ -102,14 +72,6 @@ jobs: # Tests # - - name: Install Build Dependencies - run: bash ${OPENVINO_REPO}/install_build_dependencies.sh - - - name: Install sccache - uses: mozilla-actions/sccache-action@v0.0.4 - with: - version: "v0.7.5" - - name: Build Lin ONNX Runtime run: | source ${INSTALL_DIR}/setupvars.sh @@ -133,7 +95,7 @@ jobs: if: ${{ runner.arch != 'ARM64' }} # Ticket: 126277 run: | # see https://github.com/microsoft/onnxruntime/issues/13197#issuecomment-1264542497 - apt-get install --assume-yes --no-install-recommends language-pack-en + apt-get update && apt-get install --assume-yes --no-install-recommends language-pack-en locale-gen en_US.UTF-8 update-locale LANG=en_US.UTF-8 diff --git a/.github/workflows/job_tokenizers.yml b/.github/workflows/job_tokenizers.yml index 23eca8cd5bb32a..5198ee5db996ae 100644 --- a/.github/workflows/job_tokenizers.yml +++ b/.github/workflows/job_tokenizers.yml @@ -56,6 +56,7 @@ jobs: install_build_dependencies.sh - name: Setup Python ${{ env.PYTHON_VERSION }} + if: ${{ runner.os != 'Linux' }} # We do not need to install Python on Linux as we use Docker with it installed uses: ./.github/actions/setup_python with: version: ${{ env.PYTHON_VERSION }} @@ -94,10 +95,6 @@ jobs: # Dependencies # - - name: Install build dependencies (Linux) - if: runner.os == 'Linux' - run: ./install_build_dependencies.sh - - name: Install python dependencies run: | # wheel packaging diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 5b4e6769cc1350..6b43a90fb9f61a 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -25,6 +25,7 @@ jobs: runs-on: ubuntu-latest outputs: affected_components: "${{ steps.smart_ci.outputs.affected_components }}" + changed_components: "${{ steps.smart_ci.outputs.changed_components }}" skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action @@ -45,15 +46,42 @@ jobs: skip_when_only_listed_labels_set: 'docs' skip_when_only_listed_files_changed: '*.md,*.rst,*.png,*.jpg,*.svg' - Build: + - name: Show affected components + run: | + echo "${{ toJSON(steps.smart_ci.outputs.affected_components) }}" + shell: bash + + Docker: needs: Smart_CI + runs-on: aks-linux-4-cores-16gb-docker-build + container: + image: openvinogithubactions.azurecr.io/docker_build:0.2 + volumes: + - /mount:/mount + outputs: + images: "${{ steps.handle_docker.outputs.images }}" + steps: + - name: Checkout + uses: actions/checkout@v4 + + - uses: ./.github/actions/handle_docker + id: handle_docker + with: + images: | + ov_build/ubuntu_20_04_x64 + registry: 'openvinogithubactions.azurecr.io' + dockerfiles_root_dir: '.github/dockerfiles' + changed_components: ${{ needs.smart_ci.outputs.changed_components }} + + Build: + needs: [Docker, Smart_CI] timeout-minutes: 150 defaults: run: shell: bash runs-on: aks-linux-16-cores-32gb container: - image: openvinogithubactions.azurecr.io/dockerhub/ubuntu:20.04 + image: ${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_x64 }} volumes: - /mount:/mount options: -e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING @@ -80,20 +108,21 @@ jobs: if: "!needs.smart_ci.outputs.skip_workflow" steps: - - name: Set apt 
retries - run: echo 'Acquire::Retries "10";' > /etc/apt/apt.conf.d/80-retries - - - name: Install git - run: | - apt-get update - apt-get install --assume-yes --no-install-recommends git ca-certificates - - name: Clone OpenVINO uses: actions/checkout@v4 with: path: ${{ env.OPENVINO_REPO }} submodules: 'true' + # Ticket: 139627 + - name: Checkout the latest OneDNN for GPU in nightly + if: ${{ github.event_name == 'schedule' }} + working-directory: ${{ env.OPENVINO_REPO }}/src/plugins/intel_gpu/thirdparty/onednn_gpu + run: | + git fetch origin + git checkout main + git rev-parse HEAD + - name: Clone OpenVINO Contrib uses: actions/checkout@v4 with: @@ -113,26 +142,6 @@ jobs: # Dependencies # - - name: Install build dependencies - run: | - bash ${OPENVINO_REPO}/install_build_dependencies.sh - # default-jdk - Java API - apt install --assume-yes --no-install-recommends default-jdk - - - name: Install sccache - uses: mozilla-actions/sccache-action@v0.0.4 - with: - version: "v0.7.5" - - - name: Setup Python ${{ env.PYTHON_VERSION }} - uses: ./openvino/.github/actions/setup_python - with: - version: ${{ env.PYTHON_VERSION }} - pip-cache-path: ${{ env.PIP_CACHE_PATH }} - should-setup-pip-paths: 'true' - self-hosted-runner: 'true' - show-cache-info: 'true' - - name: Install python dependencies run: | # For Python API: build and wheel packaging @@ -450,12 +459,12 @@ jobs: name: ONNX Runtime Integration if: fromJSON(needs.smart_ci.outputs.affected_components).ONNX_RT || fromJSON(needs.smart_ci.outputs.affected_components).ONNX_FE - needs: [ Build, Smart_CI ] + needs: [ Build, Smart_CI, Docker ] uses: ./.github/workflows/job_onnx_runtime.yml with: runner: 'aks-linux-16-cores-32gb' - container: '{"image": "openvinogithubactions.azurecr.io/dockerhub/ubuntu:22.04", "volumes": ["/mount:/mount"], "options": "-e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING"}' - sccache-azure-key-prefix: 'ubuntu22_x86_64_onnxruntime' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_x64 }}", "volumes": ["/mount:/mount"], "options": "-e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING"}' + sccache-azure-key-prefix: 'ubuntu20_x86_64_onnxruntime' ONNX_Models: name: ONNX Models Tests @@ -676,12 +685,12 @@ jobs: Openvino_tokenizers: name: OpenVINO tokenizers extension - needs: [ Build, Smart_CI ] + needs: [ Build, Smart_CI, Docker ] uses: ./.github/workflows/job_tokenizers.yml with: runner: 'aks-linux-4-cores-16gb' shell: bash - container: '{"image": "openvinogithubactions.azurecr.io/dockerhub/ubuntu:20.04", "volumes": ["/mount:/mount"]}' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_x64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} if: fromJSON(needs.smart_ci.outputs.affected_components).TOKENIZERS diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml index 58d3184d8cf276..2d50900d157e3e 100644 --- a/.github/workflows/linux_arm64.yml +++ b/.github/workflows/linux_arm64.yml @@ -323,12 +323,12 @@ jobs: name: ONNX Runtime Integration if: fromJSON(needs.smart_ci.outputs.affected_components).ONNX_RT || fromJSON(needs.smart_ci.outputs.affected_components).ONNX_FE - needs: [ Build, Smart_CI ] + needs: [ Build, Smart_CI, Docker ] uses: ./.github/workflows/job_onnx_runtime.yml with: runner: 'aks-linux-16-cores-arm' - container: '{"image": "openvinogithubactions.azurecr.io/dockerhub/ubuntu:22.04", "volumes": ["/mount:/mount"], "options": 
"-e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING"}' - sccache-azure-key-prefix: 'ubuntu22_aarch64_onnxruntime' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"], "options": "-e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING"}' + sccache-azure-key-prefix: 'ubuntu20_aarch64_onnxruntime' Openvino_tokenizers: name: OpenVINO tokenizers extension diff --git a/docs/articles_en/about-openvino/performance-benchmarks.rst b/docs/articles_en/about-openvino/performance-benchmarks.rst index aef6b8be5c6d11..bb874ea459380d 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks.rst @@ -163,7 +163,7 @@ connection is dedicated only to measuring performance. The benchmark setup for OVMS consists of four main parts: - .. image:: ../_static/images/performance_benchmarks_ovms_02.png + .. image:: ../assets/images/performance_benchmarks_ovms_02.png :alt: OVMS Benchmark Setup Diagram * **OpenVINO™ Model Server** is launched as a docker container on the server platform and it diff --git a/docs/sphinx_setup/_static/images/BASIC_FLOW_IE_C.svg b/docs/articles_en/assets/images/BASIC_FLOW_IE_C.svg similarity index 100% rename from docs/sphinx_setup/_static/images/BASIC_FLOW_IE_C.svg rename to docs/articles_en/assets/images/BASIC_FLOW_IE_C.svg diff --git a/docs/sphinx_setup/_static/images/DEVELOPMENT_FLOW_V3_crunch.svg b/docs/articles_en/assets/images/DEVELOPMENT_FLOW_V3_crunch.svg similarity index 100% rename from docs/sphinx_setup/_static/images/DEVELOPMENT_FLOW_V3_crunch.svg rename to docs/articles_en/assets/images/DEVELOPMENT_FLOW_V3_crunch.svg diff --git a/docs/sphinx_setup/_static/images/DeepSpeech-0.8.2.png b/docs/articles_en/assets/images/DeepSpeech-0.8.2.png similarity index 100% rename from docs/sphinx_setup/_static/images/DeepSpeech-0.8.2.png rename to docs/articles_en/assets/images/DeepSpeech-0.8.2.png diff --git a/docs/sphinx_setup/_static/images/DeviceDriverVersion.svg b/docs/articles_en/assets/images/DeviceDriverVersion.svg similarity index 100% rename from docs/sphinx_setup/_static/images/DeviceDriverVersion.svg rename to docs/articles_en/assets/images/DeviceDriverVersion.svg diff --git a/docs/sphinx_setup/_static/images/DeviceManager.PNG b/docs/articles_en/assets/images/DeviceManager.PNG similarity index 100% rename from docs/sphinx_setup/_static/images/DeviceManager.PNG rename to docs/articles_en/assets/images/DeviceManager.PNG diff --git a/docs/sphinx_setup/_static/images/FaceNet.svg b/docs/articles_en/assets/images/FaceNet.svg similarity index 100% rename from docs/sphinx_setup/_static/images/FaceNet.svg rename to docs/articles_en/assets/images/FaceNet.svg diff --git a/docs/sphinx_setup/_static/images/IMPLEMENT_PIPELINE_with_API_C.svg b/docs/articles_en/assets/images/IMPLEMENT_PIPELINE_with_API_C.svg similarity index 100% rename from docs/sphinx_setup/_static/images/IMPLEMENT_PIPELINE_with_API_C.svg rename to docs/articles_en/assets/images/IMPLEMENT_PIPELINE_with_API_C.svg diff --git a/docs/sphinx_setup/_static/images/MO_connection_example_1.svg b/docs/articles_en/assets/images/MO_connection_example_1.svg similarity index 100% rename from docs/sphinx_setup/_static/images/MO_connection_example_1.svg rename to docs/articles_en/assets/images/MO_connection_example_1.svg diff --git a/docs/sphinx_setup/_static/images/MO_conversion_pipeline.svg b/docs/articles_en/assets/images/MO_conversion_pipeline.svg similarity index 
100% rename from docs/sphinx_setup/_static/images/MO_conversion_pipeline.svg rename to docs/articles_en/assets/images/MO_conversion_pipeline.svg diff --git a/docs/sphinx_setup/_static/images/MO_graph_after_extractors.svg b/docs/articles_en/assets/images/MO_graph_after_extractors.svg similarity index 100% rename from docs/sphinx_setup/_static/images/MO_graph_after_extractors.svg rename to docs/articles_en/assets/images/MO_graph_after_extractors.svg diff --git a/docs/sphinx_setup/_static/images/MO_graph_after_loader.svg b/docs/articles_en/assets/images/MO_graph_after_loader.svg similarity index 100% rename from docs/sphinx_setup/_static/images/MO_graph_after_loader.svg rename to docs/articles_en/assets/images/MO_graph_after_loader.svg diff --git a/docs/sphinx_setup/_static/images/MO_graph_before_partial_inference.svg b/docs/articles_en/assets/images/MO_graph_before_partial_inference.svg similarity index 100% rename from docs/sphinx_setup/_static/images/MO_graph_before_partial_inference.svg rename to docs/articles_en/assets/images/MO_graph_before_partial_inference.svg diff --git a/docs/sphinx_setup/_static/images/MO_ports_example_1.svg b/docs/articles_en/assets/images/MO_ports_example_1.svg similarity index 100% rename from docs/sphinx_setup/_static/images/MO_ports_example_1.svg rename to docs/articles_en/assets/images/MO_ports_example_1.svg diff --git a/docs/sphinx_setup/_static/images/MO_ports_example_2.svg b/docs/articles_en/assets/images/MO_ports_example_2.svg similarity index 100% rename from docs/sphinx_setup/_static/images/MO_ports_example_2.svg rename to docs/articles_en/assets/images/MO_ports_example_2.svg diff --git a/docs/sphinx_setup/_static/images/MO_transformations_graph.svg b/docs/articles_en/assets/images/MO_transformations_graph.svg similarity index 100% rename from docs/sphinx_setup/_static/images/MO_transformations_graph.svg rename to docs/articles_en/assets/images/MO_transformations_graph.svg diff --git a/docs/sphinx_setup/_static/images/NCF_start.svg b/docs/articles_en/assets/images/NCF_start.svg similarity index 100% rename from docs/sphinx_setup/_static/images/NCF_start.svg rename to docs/articles_en/assets/images/NCF_start.svg diff --git a/docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img01-localhost.png b/docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img01-localhost.png similarity index 100% rename from docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img01-localhost.png rename to docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img01-localhost.png diff --git a/docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img02-launch.png b/docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img02-launch.png similarity index 100% rename from docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img02-launch.png rename to docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img02-launch.png diff --git a/docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img03-hotspots.png b/docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img03-hotspots.png similarity index 100% rename from docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img03-hotspots.png rename to docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img03-hotspots.png diff --git a/docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img04-vtunesummary.png 
b/docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img04-vtunesummary.png similarity index 100% rename from docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img04-vtunesummary.png rename to docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img04-vtunesummary.png diff --git a/docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img05-vtunebottomup.png b/docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img05-vtunebottomup.png similarity index 100% rename from docs/sphinx_setup/_static/images/OV_UG_supported_plugins_AUTO_debugging-img05-vtunebottomup.png rename to docs/articles_en/assets/images/OV_UG_supported_plugins_AUTO_debugging-img05-vtunebottomup.png diff --git a/docs/sphinx_setup/_static/images/WHAT_TO_USE.svg b/docs/articles_en/assets/images/WHAT_TO_USE.svg similarity index 100% rename from docs/sphinx_setup/_static/images/WHAT_TO_USE.svg rename to docs/articles_en/assets/images/WHAT_TO_USE.svg diff --git a/docs/sphinx_setup/_static/images/add.common.png b/docs/articles_en/assets/images/add.common.png similarity index 100% rename from docs/sphinx_setup/_static/images/add.common.png rename to docs/articles_en/assets/images/add.common.png diff --git a/docs/sphinx_setup/_static/images/add.transformed.png b/docs/articles_en/assets/images/add.transformed.png similarity index 100% rename from docs/sphinx_setup/_static/images/add.transformed.png rename to docs/articles_en/assets/images/add.transformed.png diff --git a/docs/sphinx_setup/_static/images/applying_low_latency_2.svg b/docs/articles_en/assets/images/applying_low_latency_2.svg similarity index 100% rename from docs/sphinx_setup/_static/images/applying_low_latency_2.svg rename to docs/articles_en/assets/images/applying_low_latency_2.svg diff --git a/docs/sphinx_setup/_static/images/autoplugin_accelerate.svg b/docs/articles_en/assets/images/autoplugin_accelerate.svg similarity index 100% rename from docs/sphinx_setup/_static/images/autoplugin_accelerate.svg rename to docs/articles_en/assets/images/autoplugin_accelerate.svg diff --git a/docs/sphinx_setup/_static/images/batch_device.svg b/docs/articles_en/assets/images/batch_device.svg similarity index 100% rename from docs/sphinx_setup/_static/images/batch_device.svg rename to docs/articles_en/assets/images/batch_device.svg diff --git a/docs/sphinx_setup/_static/images/batch_relaxation.png b/docs/articles_en/assets/images/batch_relaxation.png similarity index 100% rename from docs/sphinx_setup/_static/images/batch_relaxation.png rename to docs/articles_en/assets/images/batch_relaxation.png diff --git a/docs/sphinx_setup/_static/images/caching_enabled.svg b/docs/articles_en/assets/images/caching_enabled.svg similarity index 100% rename from docs/sphinx_setup/_static/images/caching_enabled.svg rename to docs/articles_en/assets/images/caching_enabled.svg diff --git a/docs/sphinx_setup/_static/images/caching_times.svg b/docs/articles_en/assets/images/caching_times.svg similarity index 100% rename from docs/sphinx_setup/_static/images/caching_times.svg rename to docs/articles_en/assets/images/caching_times.svg diff --git a/docs/sphinx_setup/_static/images/compressed_int8_Convolution_weights.png b/docs/articles_en/assets/images/compressed_int8_Convolution_weights.png similarity index 100% rename from docs/sphinx_setup/_static/images/compressed_int8_Convolution_weights.png rename to docs/articles_en/assets/images/compressed_int8_Convolution_weights.png diff --git 
a/docs/sphinx_setup/_static/images/cpu_execution_conventional_approach.svg b/docs/articles_en/assets/images/cpu_execution_conventional_approach.svg similarity index 100% rename from docs/sphinx_setup/_static/images/cpu_execution_conventional_approach.svg rename to docs/articles_en/assets/images/cpu_execution_conventional_approach.svg diff --git a/docs/sphinx_setup/_static/images/cpu_execution_streams.svg b/docs/articles_en/assets/images/cpu_execution_streams.svg similarity index 100% rename from docs/sphinx_setup/_static/images/cpu_execution_streams.svg rename to docs/articles_en/assets/images/cpu_execution_streams.svg diff --git a/docs/sphinx_setup/_static/images/cpu_execution_streams_2.svg b/docs/articles_en/assets/images/cpu_execution_streams_2.svg similarity index 100% rename from docs/sphinx_setup/_static/images/cpu_execution_streams_2.svg rename to docs/articles_en/assets/images/cpu_execution_streams_2.svg diff --git a/docs/sphinx_setup/_static/images/datumaro.png b/docs/articles_en/assets/images/datumaro.png similarity index 100% rename from docs/sphinx_setup/_static/images/datumaro.png rename to docs/articles_en/assets/images/datumaro.png diff --git a/docs/sphinx_setup/_static/images/deploy_encrypted_model.svg b/docs/articles_en/assets/images/deploy_encrypted_model.svg similarity index 100% rename from docs/sphinx_setup/_static/images/deploy_encrypted_model.svg rename to docs/articles_en/assets/images/deploy_encrypted_model.svg diff --git a/docs/sphinx_setup/_static/images/deployment_full.svg b/docs/articles_en/assets/images/deployment_full.svg similarity index 100% rename from docs/sphinx_setup/_static/images/deployment_full.svg rename to docs/articles_en/assets/images/deployment_full.svg diff --git a/docs/sphinx_setup/_static/images/deployment_simplified.svg b/docs/articles_en/assets/images/deployment_simplified.svg similarity index 100% rename from docs/sphinx_setup/_static/images/deployment_simplified.svg rename to docs/articles_en/assets/images/deployment_simplified.svg diff --git a/docs/sphinx_setup/_static/images/dog.png b/docs/articles_en/assets/images/dog.png similarity index 100% rename from docs/sphinx_setup/_static/images/dog.png rename to docs/articles_en/assets/images/dog.png diff --git a/docs/sphinx_setup/_static/images/expanded_int8_Convolution_weights.png b/docs/articles_en/assets/images/expanded_int8_Convolution_weights.png similarity index 100% rename from docs/sphinx_setup/_static/images/expanded_int8_Convolution_weights.png rename to docs/articles_en/assets/images/expanded_int8_Convolution_weights.png diff --git a/docs/sphinx_setup/_static/images/fq.common.svg b/docs/articles_en/assets/images/fq.common.svg similarity index 100% rename from docs/sphinx_setup/_static/images/fq.common.svg rename to docs/articles_en/assets/images/fq.common.svg diff --git a/docs/sphinx_setup/_static/images/fq.transformed.svg b/docs/articles_en/assets/images/fq.transformed.svg similarity index 100% rename from docs/sphinx_setup/_static/images/fq.transformed.svg rename to docs/articles_en/assets/images/fq.transformed.svg diff --git a/docs/sphinx_setup/_static/images/fq_and_convolution.common.png b/docs/articles_en/assets/images/fq_and_convolution.common.png similarity index 100% rename from docs/sphinx_setup/_static/images/fq_and_convolution.common.png rename to docs/articles_en/assets/images/fq_and_convolution.common.png diff --git a/docs/sphinx_setup/_static/images/fq_and_convolution.transformed.png b/docs/articles_en/assets/images/fq_and_convolution.transformed.png similarity index 
100% rename from docs/sphinx_setup/_static/images/fq_and_convolution.transformed.png rename to docs/articles_en/assets/images/fq_and_convolution.transformed.png diff --git a/docs/sphinx_setup/_static/images/fq_fq_and_convolution.common.png b/docs/articles_en/assets/images/fq_fq_and_convolution.common.png similarity index 100% rename from docs/sphinx_setup/_static/images/fq_fq_and_convolution.common.png rename to docs/articles_en/assets/images/fq_fq_and_convolution.common.png diff --git a/docs/sphinx_setup/_static/images/get_started_with_cpp.jpg b/docs/articles_en/assets/images/get_started_with_cpp.jpg similarity index 100% rename from docs/sphinx_setup/_static/images/get_started_with_cpp.jpg rename to docs/articles_en/assets/images/get_started_with_cpp.jpg diff --git a/docs/sphinx_setup/_static/images/get_started_with_python.gif b/docs/articles_en/assets/images/get_started_with_python.gif similarity index 100% rename from docs/sphinx_setup/_static/images/get_started_with_python.gif rename to docs/articles_en/assets/images/get_started_with_python.gif diff --git a/docs/sphinx_setup/_static/images/graph_rewrite_efficient_search.png b/docs/articles_en/assets/images/graph_rewrite_efficient_search.png similarity index 100% rename from docs/sphinx_setup/_static/images/graph_rewrite_efficient_search.png rename to docs/articles_en/assets/images/graph_rewrite_efficient_search.png diff --git a/docs/sphinx_setup/_static/images/graph_rewrite_execution.png b/docs/articles_en/assets/images/graph_rewrite_execution.png similarity index 100% rename from docs/sphinx_setup/_static/images/graph_rewrite_execution.png rename to docs/articles_en/assets/images/graph_rewrite_execution.png diff --git a/docs/sphinx_setup/_static/images/inception_v1_first_block.svg b/docs/articles_en/assets/images/inception_v1_first_block.svg similarity index 100% rename from docs/sphinx_setup/_static/images/inception_v1_first_block.svg rename to docs/articles_en/assets/images/inception_v1_first_block.svg diff --git a/docs/sphinx_setup/_static/images/inception_v1_std_input.svg b/docs/articles_en/assets/images/inception_v1_std_input.svg similarity index 100% rename from docs/sphinx_setup/_static/images/inception_v1_std_input.svg rename to docs/articles_en/assets/images/inception_v1_std_input.svg diff --git a/docs/sphinx_setup/_static/images/inception_v1_std_output.svg b/docs/articles_en/assets/images/inception_v1_std_output.svg similarity index 100% rename from docs/sphinx_setup/_static/images/inception_v1_std_output.svg rename to docs/articles_en/assets/images/inception_v1_std_output.svg diff --git a/docs/sphinx_setup/_static/images/large_batch_approach.svg b/docs/articles_en/assets/images/large_batch_approach.svg similarity index 100% rename from docs/sphinx_setup/_static/images/large_batch_approach.svg rename to docs/articles_en/assets/images/large_batch_approach.svg diff --git a/docs/sphinx_setup/_static/images/launch_in_binder.svg b/docs/articles_en/assets/images/launch_in_binder.svg similarity index 100% rename from docs/sphinx_setup/_static/images/launch_in_binder.svg rename to docs/articles_en/assets/images/launch_in_binder.svg diff --git a/docs/sphinx_setup/_static/images/llt2_use_const_initializer.svg b/docs/articles_en/assets/images/llt2_use_const_initializer.svg similarity index 100% rename from docs/sphinx_setup/_static/images/llt2_use_const_initializer.svg rename to docs/articles_en/assets/images/llt2_use_const_initializer.svg diff --git a/docs/sphinx_setup/_static/images/lm_1b.svg 
b/docs/articles_en/assets/images/lm_1b.svg similarity index 100% rename from docs/sphinx_setup/_static/images/lm_1b.svg rename to docs/articles_en/assets/images/lm_1b.svg diff --git a/docs/sphinx_setup/_static/images/low_latency_limitation_2.svg b/docs/articles_en/assets/images/low_latency_limitation_2.svg similarity index 100% rename from docs/sphinx_setup/_static/images/low_latency_limitation_2.svg rename to docs/articles_en/assets/images/low_latency_limitation_2.svg diff --git a/docs/sphinx_setup/_static/images/low_precision_transformation_pipeline.svg b/docs/articles_en/assets/images/low_precision_transformation_pipeline.svg similarity index 100% rename from docs/sphinx_setup/_static/images/low_precision_transformation_pipeline.svg rename to docs/articles_en/assets/images/low_precision_transformation_pipeline.svg diff --git a/docs/sphinx_setup/_static/images/make_stateful_detailed.png b/docs/articles_en/assets/images/make_stateful_detailed.png similarity index 100% rename from docs/sphinx_setup/_static/images/make_stateful_detailed.png rename to docs/articles_en/assets/images/make_stateful_detailed.png diff --git a/docs/sphinx_setup/_static/images/make_stateful_simple.svg b/docs/articles_en/assets/images/make_stateful_simple.svg similarity index 100% rename from docs/sphinx_setup/_static/images/make_stateful_simple.svg rename to docs/articles_en/assets/images/make_stateful_simple.svg diff --git a/docs/sphinx_setup/_static/images/model_fq_and_convolution.common.svg b/docs/articles_en/assets/images/model_fq_and_convolution.common.svg similarity index 100% rename from docs/sphinx_setup/_static/images/model_fq_and_convolution.common.svg rename to docs/articles_en/assets/images/model_fq_and_convolution.common.svg diff --git a/docs/sphinx_setup/_static/images/model_fq_and_convolution.transformed.svg b/docs/articles_en/assets/images/model_fq_and_convolution.transformed.svg similarity index 100% rename from docs/sphinx_setup/_static/images/model_fq_and_convolution.transformed.svg rename to docs/articles_en/assets/images/model_fq_and_convolution.transformed.svg diff --git a/docs/sphinx_setup/_static/images/model_qdq_and_convolution.common.svg b/docs/articles_en/assets/images/model_qdq_and_convolution.common.svg similarity index 100% rename from docs/sphinx_setup/_static/images/model_qdq_and_convolution.common.svg rename to docs/articles_en/assets/images/model_qdq_and_convolution.common.svg diff --git a/docs/sphinx_setup/_static/images/nncf_workflow.svg b/docs/articles_en/assets/images/nncf_workflow.svg similarity index 100% rename from docs/sphinx_setup/_static/images/nncf_workflow.svg rename to docs/articles_en/assets/images/nncf_workflow.svg diff --git a/docs/sphinx_setup/_static/images/open_in_colab.svg b/docs/articles_en/assets/images/open_in_colab.svg similarity index 100% rename from docs/sphinx_setup/_static/images/open_in_colab.svg rename to docs/articles_en/assets/images/open_in_colab.svg diff --git a/docs/sphinx_setup/_static/images/original_vs_reshaped_model.svg b/docs/articles_en/assets/images/original_vs_reshaped_model.svg similarity index 100% rename from docs/sphinx_setup/_static/images/original_vs_reshaped_model.svg rename to docs/articles_en/assets/images/original_vs_reshaped_model.svg diff --git a/docs/sphinx_setup/_static/images/ov_insert_node.png b/docs/articles_en/assets/images/ov_insert_node.png similarity index 100% rename from docs/sphinx_setup/_static/images/ov_insert_node.png rename to docs/articles_en/assets/images/ov_insert_node.png diff --git 
a/docs/sphinx_setup/_static/images/ov_replace_node.png b/docs/articles_en/assets/images/ov_replace_node.png similarity index 100% rename from docs/sphinx_setup/_static/images/ov_replace_node.png rename to docs/articles_en/assets/images/ov_replace_node.png diff --git a/docs/sphinx_setup/_static/images/ov_workflow_diagram_convenience.svg b/docs/articles_en/assets/images/ov_workflow_diagram_convenience.svg similarity index 100% rename from docs/sphinx_setup/_static/images/ov_workflow_diagram_convenience.svg rename to docs/articles_en/assets/images/ov_workflow_diagram_convenience.svg diff --git a/docs/sphinx_setup/_static/images/ov_workflow_diagram_performance.svg b/docs/articles_en/assets/images/ov_workflow_diagram_performance.svg similarity index 100% rename from docs/sphinx_setup/_static/images/ov_workflow_diagram_performance.svg rename to docs/articles_en/assets/images/ov_workflow_diagram_performance.svg diff --git a/docs/sphinx_setup/_static/images/ovsa_diagram.svg b/docs/articles_en/assets/images/ovsa_diagram.svg similarity index 100% rename from docs/sphinx_setup/_static/images/ovsa_diagram.svg rename to docs/articles_en/assets/images/ovsa_diagram.svg diff --git a/docs/sphinx_setup/_static/images/ovsa_example.svg b/docs/articles_en/assets/images/ovsa_example.svg similarity index 100% rename from docs/sphinx_setup/_static/images/ovsa_example.svg rename to docs/articles_en/assets/images/ovsa_example.svg diff --git a/docs/sphinx_setup/_static/images/performance_benchmarks_ovms_02.png b/docs/articles_en/assets/images/performance_benchmarks_ovms_02.png similarity index 100% rename from docs/sphinx_setup/_static/images/performance_benchmarks_ovms_02.png rename to docs/articles_en/assets/images/performance_benchmarks_ovms_02.png diff --git a/docs/sphinx_setup/_static/images/preprocess_not_fit.png b/docs/articles_en/assets/images/preprocess_not_fit.png similarity index 100% rename from docs/sphinx_setup/_static/images/preprocess_not_fit.png rename to docs/articles_en/assets/images/preprocess_not_fit.png diff --git a/docs/sphinx_setup/_static/images/qdq_propagation.png b/docs/articles_en/assets/images/qdq_propagation.png similarity index 100% rename from docs/sphinx_setup/_static/images/qdq_propagation.png rename to docs/articles_en/assets/images/qdq_propagation.png diff --git a/docs/sphinx_setup/_static/images/quantization_picture.svg b/docs/articles_en/assets/images/quantization_picture.svg similarity index 100% rename from docs/sphinx_setup/_static/images/quantization_picture.svg rename to docs/articles_en/assets/images/quantization_picture.svg diff --git a/docs/sphinx_setup/_static/images/IE_PLUGIN_DG/images/quantized_convolution.png b/docs/articles_en/assets/images/quantized_convolution.png similarity index 100% rename from docs/sphinx_setup/_static/images/IE_PLUGIN_DG/images/quantized_convolution.png rename to docs/articles_en/assets/images/quantized_convolution.png diff --git a/docs/sphinx_setup/_static/images/IE_PLUGIN_DG/images/quantized_model_example.png b/docs/articles_en/assets/images/quantized_model_example.png similarity index 100% rename from docs/sphinx_setup/_static/images/IE_PLUGIN_DG/images/quantized_model_example.png rename to docs/articles_en/assets/images/quantized_model_example.png diff --git a/docs/sphinx_setup/_static/images/small_IR_graph_demonstration.png b/docs/articles_en/assets/images/small_IR_graph_demonstration.png similarity index 100% rename from docs/sphinx_setup/_static/images/small_IR_graph_demonstration.png rename to 
docs/articles_en/assets/images/small_IR_graph_demonstration.png diff --git a/docs/sphinx_setup/_static/images/stateful_model_example.svg b/docs/articles_en/assets/images/stateful_model_example.svg similarity index 100% rename from docs/sphinx_setup/_static/images/stateful_model_example.svg rename to docs/articles_en/assets/images/stateful_model_example.svg diff --git a/docs/sphinx_setup/_static/images/stateful_model_init_subgraph.svg b/docs/articles_en/assets/images/stateful_model_init_subgraph.svg similarity index 100% rename from docs/sphinx_setup/_static/images/stateful_model_init_subgraph.svg rename to docs/articles_en/assets/images/stateful_model_init_subgraph.svg diff --git a/docs/sphinx_setup/_static/images/step2_markup1.svg b/docs/articles_en/assets/images/step2_markup1.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step2_markup1.svg rename to docs/articles_en/assets/images/step2_markup1.svg diff --git a/docs/sphinx_setup/_static/images/step2_markup2.svg b/docs/articles_en/assets/images/step2_markup2.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step2_markup2.svg rename to docs/articles_en/assets/images/step2_markup2.svg diff --git a/docs/sphinx_setup/_static/images/step2_markup3.svg b/docs/articles_en/assets/images/step2_markup3.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step2_markup3.svg rename to docs/articles_en/assets/images/step2_markup3.svg diff --git a/docs/sphinx_setup/_static/images/step2_markup4.svg b/docs/articles_en/assets/images/step2_markup4.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step2_markup4.svg rename to docs/articles_en/assets/images/step2_markup4.svg diff --git a/docs/sphinx_setup/_static/images/step2_markup5.svg b/docs/articles_en/assets/images/step2_markup5.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step2_markup5.svg rename to docs/articles_en/assets/images/step2_markup5.svg diff --git a/docs/sphinx_setup/_static/images/step2_markup6.svg b/docs/articles_en/assets/images/step2_markup6.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step2_markup6.svg rename to docs/articles_en/assets/images/step2_markup6.svg diff --git a/docs/sphinx_setup/_static/images/step2_markup7.svg b/docs/articles_en/assets/images/step2_markup7.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step2_markup7.svg rename to docs/articles_en/assets/images/step2_markup7.svg diff --git a/docs/sphinx_setup/_static/images/step2_markup_original.svg b/docs/articles_en/assets/images/step2_markup_original.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step2_markup_original.svg rename to docs/articles_en/assets/images/step2_markup_original.svg diff --git a/docs/sphinx_setup/_static/images/step3_original.svg b/docs/articles_en/assets/images/step3_original.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step3_original.svg rename to docs/articles_en/assets/images/step3_original.svg diff --git a/docs/sphinx_setup/_static/images/step3_transformed.svg b/docs/articles_en/assets/images/step3_transformed.svg similarity index 100% rename from docs/sphinx_setup/_static/images/step3_transformed.svg rename to docs/articles_en/assets/images/step3_transformed.svg diff --git a/docs/sphinx_setup/_static/images/img/subgraphs_dumper_arch_diaram.png b/docs/articles_en/assets/images/subgraphs_dumper_arch_diaram.png similarity index 100% rename from 
docs/sphinx_setup/_static/images/img/subgraphs_dumper_arch_diaram.png rename to docs/articles_en/assets/images/subgraphs_dumper_arch_diaram.png diff --git a/docs/sphinx_setup/_static/images/synch-vs-asynch.svg b/docs/articles_en/assets/images/synch-vs-asynch.svg similarity index 100% rename from docs/sphinx_setup/_static/images/synch-vs-asynch.svg rename to docs/articles_en/assets/images/synch-vs-asynch.svg diff --git a/docs/sphinx_setup/_static/images/tokenization.svg b/docs/articles_en/assets/images/tokenization.svg similarity index 100% rename from docs/sphinx_setup/_static/images/tokenization.svg rename to docs/articles_en/assets/images/tokenization.svg diff --git a/docs/sphinx_setup/_static/images/torch_compile_backend_openvino.svg b/docs/articles_en/assets/images/torch_compile_backend_openvino.svg similarity index 100% rename from docs/sphinx_setup/_static/images/torch_compile_backend_openvino.svg rename to docs/articles_en/assets/images/torch_compile_backend_openvino.svg diff --git a/docs/sphinx_setup/_static/images/training_extensions_framework.png b/docs/articles_en/assets/images/training_extensions_framework.png similarity index 100% rename from docs/sphinx_setup/_static/images/training_extensions_framework.png rename to docs/articles_en/assets/images/training_extensions_framework.png diff --git a/docs/sphinx_setup/_static/images/transformations_structure.png b/docs/articles_en/assets/images/transformations_structure.png similarity index 100% rename from docs/sphinx_setup/_static/images/transformations_structure.png rename to docs/articles_en/assets/images/transformations_structure.png diff --git a/docs/sphinx_setup/_static/images/view_on_github.svg b/docs/articles_en/assets/images/view_on_github.svg similarity index 100% rename from docs/sphinx_setup/_static/images/view_on_github.svg rename to docs/articles_en/assets/images/view_on_github.svg diff --git a/docs/articles_en/documentation/legacy-features/install-dev-tools.rst b/docs/articles_en/documentation/legacy-features/install-dev-tools.rst index 59e97b69ab7444..6466eb8711a381 100644 --- a/docs/articles_en/documentation/legacy-features/install-dev-tools.rst +++ b/docs/articles_en/documentation/legacy-features/install-dev-tools.rst @@ -153,7 +153,7 @@ For example, to install and configure dependencies required for working with Ten For more details on the openvino-dev PyPI package, see `pypi.org `__ . Step 5. Test the Installation -+++++++++++++++++++++++++++++ +------------------------------ To verify the package is properly installed, run the command below (this may take a few seconds): @@ -173,7 +173,7 @@ Learn more about OpenVINO and use it in your own application by trying out some Get started with Python +++++++++++++++++++++++ -.. image:: ../../_static/images/get_started_with_python.gif +.. image:: ../../assets/images/get_started_with_python.gif :width: 400 Try the `Python Quick Start Example <../../notebooks/vision-monodepth-with-output.html>`__ to estimate depth in a scene using an OpenVINO monodepth model in a Jupyter Notebook inside your web browser. @@ -187,7 +187,7 @@ Visit the :doc:`Tutorials <../../learn-openvino/interactive-tutorials-python>` p Get started with C++ ++++++++++++++++++++ -.. image:: ../../_static/images/get_started_with_cpp.jpg +.. 
image:: ../../assets/images/get_started_with_cpp.jpg :width: 400 diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-cutting-parts-of-a-model.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-cutting-parts-of-a-model.rst index 917998c7ebaf9c..e2099fdc2b0562 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-cutting-parts-of-a-model.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-cutting-parts-of-a-model.rst @@ -43,14 +43,14 @@ The input model is converted as a whole if neither ``input`` nor ``output`` comm For Inception_V1, there is one ``Placeholder``: input. If the model is viewed in TensorBoard, the input operation is easy to find: -.. image:: ../../../../_static/images/inception_v1_std_input.svg +.. image:: ../../../../assets/images/inception_v1_std_input.svg :alt: Placeholder in Inception V1 ``Reshape`` is the only output operation, which is enclosed in a nested name scope of ``InceptionV1/Logits/Predictions``, under the full name of ``InceptionV1/Logits/Predictions/Reshape_1``. In TensorBoard, along with some of its predecessors, it looks as follows: -.. image:: ../../../../_static/images/inception_v1_std_output.svg +.. image:: ../../../../assets/images/inception_v1_std_output.svg :alt: TensorBoard with predecessors Convert this model to ``ov.Model``: @@ -150,7 +150,7 @@ Model Cutting Now, consider how to cut some parts of the model off. This chapter describes the first convolution block ``InceptionV1/InceptionV1/Conv2d_1a_7x7`` of the Inception V1 model to illustrate cutting: -.. image:: ../../../../_static/images/inception_v1_first_block.svg +.. image:: ../../../../assets/images/inception_v1_first_block.svg :alt: Inception V1 first convolution block Cutting at the End diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-deep-speech.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-deep-speech.rst index 65af9f6322fb12..71c28a5db9205d 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-deep-speech.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-deep-speech.rst @@ -77,7 +77,7 @@ Refer to the :doc:`Using Shape Inference <../../../../../../openvino-workflow/ru The second is that the frozen model still has two variables: ``previous_state_c`` and ``previous_state_h``, figure with the frozen *.pb model is below. It means that the model keeps training these variables at each inference. -.. image:: ./../../../../../../_static/images/DeepSpeech-0.8.2.png +.. image:: ../../../../../../assets/images/DeepSpeech-0.8.2.png At the first inference, the variables are initialized with zero tensors. After execution, the results of the ``BlockLSTM`` are assigned to cell state and hidden state, which are these two variables. 
diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-face-net.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-face-net.rst index 3d191b0859a2bf..3dc28444781b1a 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-face-net.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-face-net.rst @@ -5,15 +5,15 @@ Converting TensorFlow FaceNet Models .. meta:: - :description: Learn how to convert a FaceNet model + :description: Learn how to convert a FaceNet model from TensorFlow to the OpenVINO Intermediate Representation. .. danger:: The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Supported Model Formats <../../../../../../openvino-workflow/model-preparation>` article. - + This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Supported Model Formats <../../../../../../openvino-workflow/model-preparation>` article. + `Public pre-trained FaceNet models `__ contain both training and inference part of graph. Switch between this two states is manageable with placeholder value. Intermediate Representation (IR) models are intended for inference, which means that train part is redundant. @@ -21,7 +21,7 @@ Intermediate Representation (IR) models are intended for inference, which means There are two inputs in this network: boolean ``phase_train`` which manages state of the graph (train/infer) and ``batch_size`` which is a part of batch joining pattern. -.. image:: ./../../../../../../_static/images/FaceNet.svg +.. image:: ../../../../../../assets/images/FaceNet.svg Converting a TensorFlow FaceNet Model to the IR ############################################### diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-language-1b.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-language-1b.rst index 5b0b97f42e78de..248d41f7eea4a7 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-language-1b.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-language-1b.rst @@ -92,7 +92,7 @@ Once you have downloaded the pretrained model files, you will have the ``lm_1b`` -.. image:: ./../../../../../../_static/images/lm_1b.svg +.. 
image:: ../../../../../../assets/images/lm_1b.svg The frozen model still has two variables: ``Variable`` and ``Variable_1``. It means that the model keeps training those variables at each inference. diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-ncf.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-ncf.rst index 111549f2afd6b5..5a944288906b14 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-ncf.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-ncf.rst @@ -43,7 +43,7 @@ This tutorial explains how to convert Neural Collaborative Filtering (NCF) model 3. Convert the model to the OpenVINO format. If you look at your frozen model, you can see that it has one input that is split into four ``ResourceGather`` layers. (Click image to zoom in.) - .. image:: ./../../../../../../_static/images/NCF_start.svg + .. image:: ../../../../../../assets/images/NCF_start.svg However, as the model conversion API does not support such data feeding, you should skip it. Cut the edges incoming in ``ResourceGather`` port 1: diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-troubleshooting-reshape-errors.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-troubleshooting-reshape-errors.rst index 18cc42f36ad6ec..66a8f4563bc9ef 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-troubleshooting-reshape-errors.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-troubleshooting-reshape-errors.rst @@ -48,7 +48,7 @@ To fix some operators which prevent normal shape propagation: With ``->[0 -1]``, this new ``Parameter`` is replaced by a ``Constant`` operator which has the ``[0, -1]`` value. Since the ``Reshape`` operator has ``0`` and ``-1`` as specific values, it allows propagating shapes freely without losing the intended meaning of ``Reshape``. For more information, see :doc:`the specification <../../../openvino-ir-format/operation-sets/operation-specs/shape/reshape-1>`. - .. image:: ../../../../_static/images/batch_relaxation.png + .. image:: ../../../../assets/images/batch_relaxation.png * transform the model conversion on the back phase. For more information, see the :doc:`How to Convert a Model <../legacy-model-optimizer-extensibility>`, * transform OpenVINO Model during the runtime. 
For more information, see :doc:`OpenVINO Runtime Transformations <../../../openvino-extensibility/transformation-api>`, diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility.rst index 965e6be70f4c80..1ecf2a55b94ec1 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility.rst @@ -83,7 +83,7 @@ Model Conversion Pipeline A model conversion pipeline can be represented with the following diagram: -.. image:: ../../../_static/images/MO_conversion_pipeline.svg +.. image:: ../../../assets/images/MO_conversion_pipeline.svg Each conversion step is reviewed in details below. @@ -100,7 +100,7 @@ is a separate loader for each supported framework. These loaders are implemented The result of a model loading step is a ``Graph`` object, which can be depicted like in the following example: -.. image:: ../../../_static/images/MO_graph_after_loader.svg +.. image:: ../../../assets/images/MO_graph_after_loader.svg Model Optimizer loader saves an operation instance framework description (usually it is a Protobuf message) into a node attribute usually with a name ``pb`` for each operation of an input model. It is important that this is a @@ -134,7 +134,7 @@ The extractors execution order is the following: The result of operations attributes extracting step can be depicted like in the following example: -.. image:: ../../../_static/images/MO_graph_after_extractors.svg +.. image:: ../../../assets/images/MO_graph_after_extractors.svg The only difference in the graph from the previous step is that nodes contain dictionary with extracted attributes and operation-specific attributes needed for Model Optimizer. However, from this step, Model Optimizer does not @@ -203,7 +203,7 @@ Model Optimizer does not have value propagation implementation for the operation Before running partial inference, the graph can be depicted like in the following example: -.. image:: ../../../_static/images/MO_graph_before_partial_inference.svg +.. image:: ../../../assets/images/MO_graph_before_partial_inference.svg The difference in a graph structure with a graph during the front phase is not only in the data nodes, but also in the edge attributes. 
Note that an ``out`` attribute is specified for edges **from operation** nodes only, while an ``in`` diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-graph-traversal-and-modification.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-graph-traversal-and-modification.rst index 1c8aa73b014cbd..b2be35f4452832 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-graph-traversal-and-modification.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-graph-traversal-and-modification.rst @@ -95,7 +95,7 @@ port with ``idx = 2`` corresponds to the incoming edge of a node with an attribu Consider the example of a graph part with 4 operation nodes "Op1", "Op2", "Op3", and "Op4" and a number of data nodes depicted with light green boxes. -.. image:: ../../../../_static/images/MO_ports_example_1.svg +.. image:: ../../../../assets/images/MO_ports_example_1.svg :scale: 80 % :align: center @@ -132,7 +132,7 @@ For example, applying the following two methods to the graph above will result i op4.in_port(1).disconnect() op3.out_port(0).connect(op4.in_port(1)) -.. image:: ../../../../_static/images/MO_ports_example_2.svg +.. image:: ../../../../assets/images/MO_ports_example_2.svg :scale: 80 % :align: center @@ -165,7 +165,7 @@ example, the function call ``op3.out_port(0).get_connection().set_source(op1.out consuming data from port ``op3.out_port(0)`` to ``op1.out_port(0)``. The transformed graph from the sample above is depicted below: -.. image:: ../../../../_static/images/MO_connection_example_1.svg +.. image:: ../../../../assets/images/MO_connection_example_1.svg :scale: 80 % :align: center diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-graph-transformation-extensions.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-graph-transformation-extensions.rst index 39162e5c6fc78a..ea6f51aa61e227 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-graph-transformation-extensions.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-graph-transformation-extensions.rst @@ -34,7 +34,7 @@ order. To execute the transformation during a proper model conversion phase, Mod anchor transformations that do nothing. All transformations are ordered with respect to these anchor transformations. The diagram below shows anchor transformations, some of built-in transformations and dependencies between them: -.. image:: ../../../../../_static/images/MO_transformations_graph.svg +.. image:: ../../../../../assets/images/MO_transformations_graph.svg User-defined transformations are executed after the corresponding ``Start`` and before the corresponding ``Finish`` anchor transformations by default (if ``run_before()`` and ``run_after()`` methods have not been overridden). 
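As a rough orientation for the ordering mechanism described above, the sketch below shows how a user-defined transformation typically declares its position relative to the anchor transformations via ``run_after()`` and ``run_before()``. This is a sketch only: the module paths and anchor class names are assumptions about the legacy ``openvino-dev`` package layout and should be verified against the installed version.

.. code-block:: python

   # Sketch only: the module paths below assume the legacy Model Optimizer
   # layout shipped with openvino-dev; verify them against your installation.
   from openvino.tools.mo.middle.replacement import MiddleReplacementPattern


   class MyMiddleTransformation(MiddleReplacementPattern):
       enabled = True

       def run_after(self):
           # Execute somewhere after the MiddleStart anchor transformation.
           from openvino.tools.mo.middle.pass_separator import MiddleStart
           return [MiddleStart]

       def run_before(self):
           # ...and before the MiddleFinish anchor transformation.
           from openvino.tools.mo.middle.pass_separator import MiddleFinish
           return [MiddleFinish]

       def find_and_replace_pattern(self, graph):
           pass  # graph modification logic goes here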
diff --git a/docs/articles_en/documentation/openvino-ecosystem/datumaro.rst b/docs/articles_en/documentation/openvino-ecosystem/datumaro.rst index 2aa60cc18fb61a..c6b257ae3f17ca 100644 --- a/docs/articles_en/documentation/openvino-ecosystem/datumaro.rst +++ b/docs/articles_en/documentation/openvino-ecosystem/datumaro.rst @@ -21,7 +21,9 @@ Plus, enjoy `Jupyter notebooks `__ @@ -142,9 +142,9 @@ Begin this step on the Intel® Core™ or Xeon® processor machine that meets th 3. Install the Kernel-based Virtual Machine (KVM) and QEMU packages. - .. code-block:: sh + .. code-block:: sh - sudo apt install qemu qemu-kvm libvirt-bin bridge-utils virt-manager + sudo apt install qemu qemu-kvm libvirt-bin bridge-utils virt-manager 4. Check the QEMU version: @@ -288,16 +288,16 @@ This example in this step uses the following names. Your configuration might use 10. Create a script named ``virbr0-qemu-ifdown`` to bring down the ``virbr0`` interface. Add the following script contents: - .. code-block:: sh + .. code-block:: sh - #!/bin/sh - nic=$1 - if [ -f /etc/default/qemu-kvm ]; then - . /etc/default/qemu-kvm - fi - switch=virbr0 - brctl delif $switch $nic - ifconfig $nic 0.0.0.0 down + #!/bin/sh + nic=$1 + if [ -f /etc/default/qemu-kvm ]; then + . /etc/default/qemu-kvm + fi + switch=virbr0 + brctl delif $switch $nic + ifconfig $nic 0.0.0.0 down See the QEMU documentation for more information about the QEMU network configuration. @@ -390,43 +390,43 @@ As an option, you can use ``virsh`` and the virtual machine manager to create an 10. Start the vTPM on Host, write the HW TPM data into its NVRAM and restart the vTPM for QEMU: - .. code-block:: sh + .. code-block:: sh - sudo swtpm socket --tpm2 --server port=8280 \ - --ctrl type=tcp,port=8281 \ - --flags not-need-init --tpmstate dir=/var/OVSA/vtpm/vtpm_isv_dev & + sudo swtpm socket --tpm2 --server port=8280 \ + --ctrl type=tcp,port=8281 \ + --flags not-need-init --tpmstate dir=/var/OVSA/vtpm/vtpm_isv_dev & - sudo tpm2_startup --clear -T swtpm:port=8280 - sudo tpm2_startup -T swtpm:port=8280 - python3 /Scripts/host/OVSA_write_hwquote_swtpm_nvram.py 8280 - sudo pkill -f vtpm_isv_dev + sudo tpm2_startup --clear -T swtpm:port=8280 + sudo tpm2_startup -T swtpm:port=8280 + python3 /Scripts/host/OVSA_write_hwquote_swtpm_nvram.py 8280 + sudo pkill -f vtpm_isv_dev - swtpm socket --tpmstate dir=/var/OVSA/vtpm/vtpm_isv_dev \ - --tpm2 \ - --ctrl type=unixio,path=/var/OVSA/vtpm/vtpm_isv_dev/swtpm-sock \ - --log level=20 + swtpm socket --tpmstate dir=/var/OVSA/vtpm/vtpm_isv_dev \ + --tpm2 \ + --ctrl type=unixio,path=/var/OVSA/vtpm/vtpm_isv_dev/swtpm-sock \ + --log level=20 11. Start the Guest VM: - .. code-block:: sh - - sudo qemu-system-x86_64 \ - -cpu host \ - -enable-kvm \ - -m 8192 \ - -smp 8,sockets=1,cores=8,threads=1 \ - -device e1000,netdev=hostnet0,mac=52:54:00:d1:66:6f \ - -netdev tap,id=hostnet0,script=/br0-qemu-ifup,downscript=/br0-qemu-ifdown \ - -device e1000,netdev=hostnet1,mac=52:54:00:d1:66:5f \ - -netdev tap,id=hostnet1,script=/virbr0-qemu-ifup,downscript=/virbr0-qemu-ifdown \ - -drive if=virtio,file=/ovsa_isv_dev_vm_disk.qcow2,cache=none \ - -chardev socket,id=chrtpm,path=/var/OVSA/vtpm/vtpm_isv_dev/swtpm-sock \ - -tpmdev emulator,id=tpm0,chardev=chrtpm \ - -device tpm-tis,tpmdev=tpm0 \ - -vnc :1 + .. code-block:: sh - Use the QEMU runtime options in the command to change the memory amount or CPU assigned to this Guest VM. 
+ sudo qemu-system-x86_64 \ + -cpu host \ + -enable-kvm \ + -m 8192 \ + -smp 8,sockets=1,cores=8,threads=1 \ + -device e1000,netdev=hostnet0,mac=52:54:00:d1:66:6f \ + -netdev tap,id=hostnet0,script=/br0-qemu-ifup,downscript=/br0-qemu-ifdown \ + -device e1000,netdev=hostnet1,mac=52:54:00:d1:66:5f \ + -netdev tap,id=hostnet1,script=/virbr0-qemu-ifup,downscript=/virbr0-qemu-ifdown \ + -drive if=virtio,file=/ovsa_isv_dev_vm_disk.qcow2,cache=none \ + -chardev socket,id=chrtpm,path=/var/OVSA/vtpm/vtpm_isv_dev/swtpm-sock \ + -tpmdev emulator,id=tpm0,chardev=chrtpm \ + -device tpm-tis,tpmdev=tpm0 \ + -vnc :1 + + Use the QEMU runtime options in the command to change the memory amount or CPU assigned to this Guest VM. 12. Use a VNC client to log on to the Guest VM at ``:1`` @@ -701,9 +701,9 @@ The Model Hosting components install the OpenVINO™ Security Add-on Runtime Doc 1. Log on to the Guest VM as ````. 2. Create the OpenVINO™ Security Add-on directory in the home directory - .. code-block:: sh + .. code-block:: sh - mkdir -p ~/OVSA + mkdir -p ~/OVSA 3. While on the Host Machine copy the ovsa-model-hosting.tar.gz from release_files to the Guest VM: @@ -744,7 +744,7 @@ The following figure describes the interactions between the Model Developer, Ind The Model Developer/Independent Software Vendor and User roles are related to virtual machine use and one person might fill the tasks required by multiple roles. In this document the tasks of Model Developer and Independent Software Vendor are combined and use the Guest VM named ``ovsa_isv``. It is possible to have all roles set up on the same Host Machine. -.. image:: ../../_static/images/ovsa_example.svg +.. image:: ../../assets/images/ovsa_example.svg Model Developer Instructions ++++++++++++++++++++++++++++ @@ -770,7 +770,8 @@ Step 2: Create a key store and add a certificate to it ------------------------------------------------------ 1. Create files to request a certificate: -This example uses a self-signed certificate for demonstration purposes. In a production environment, use CSR files to request for a CA-signed certificate. + + This example uses a self-signed certificate for demonstration purposes. In a production environment, use CSR files to request for a CA-signed certificate. .. code-block:: sh @@ -869,8 +870,8 @@ Step 7: Receive a User Request 5. Provide these files to the User: - * ``face_detection_model.dat`` - * ``face_detection_model.lic`` + * ``face_detection_model.dat`` + * ``face_detection_model.lic`` Model User Instructions +++++++++++++++++++++++ @@ -988,9 +989,9 @@ Step 5: Start the NGINX Model Server The NGINX Model Server publishes the access controlled model. - .. code-block:: sh +.. code-block:: sh - ./start_secure_ovsa_model_server.sh + ./start_secure_ovsa_model_server.sh For information about the NGINX interface follow `here `__. @@ -1051,6 +1052,7 @@ References ########## Use these links for more information: + - `OpenVINO toolkit `__ - `OpenVINO Model Server Quick Start Guide `__ - `Model repository `__ diff --git a/docs/articles_en/documentation/openvino-ecosystem/openvino-training-extensions.rst b/docs/articles_en/documentation/openvino-ecosystem/openvino-training-extensions.rst index 76bb29e7925a32..c261c6dd06ce0a 100644 --- a/docs/articles_en/documentation/openvino-ecosystem/openvino-training-extensions.rst +++ b/docs/articles_en/documentation/openvino-ecosystem/openvino-training-extensions.rst @@ -18,7 +18,7 @@ inference. It allows you to export and convert the models to the needed format. 
Detailed Workflow ################# -.. image:: ./../../_static/images/training_extensions_framework.png +.. image:: ../../assets/images/training_extensions_framework.png 1. To start working with OpenVINO Training Extensions, prepare and annotate your dataset. For example, on CVAT. diff --git a/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/low-precision-transformations.rst b/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/low-precision-transformations.rst index df0edd1f038135..c8e041e5a367e9 100644 --- a/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/low-precision-transformations.rst +++ b/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/low-precision-transformations.rst @@ -79,7 +79,7 @@ If operation is not supported by LPT then dequantization operation will not be p For example, if you would like to infer a model with ``Convolution`` operation in low precision then the model can look as on picture below: -.. image:: ../../../../_static/images/model_fq_and_convolution.common.svg +.. image:: ../../../../assets/images/model_fq_and_convolution.common.svg :alt: Quantized Convolution There are several supported quantization approaches on activations and on weights. All supported approaches are described in `Quantization approaches <#quantization-approaches>`__ section below. In demonstrated model `FakeQuantize operation quantization <#fakequantize-operation>`__ approach is used. @@ -104,7 +104,7 @@ FakeQuantize operation In this case ``FakeQuantize`` operation is used on activations and quantized constant on weights. Original input model: -.. image:: ../../../../_static/images/model_fq_and_convolution.common.svg +.. image:: ../../../../assets/images/model_fq_and_convolution.common.svg :alt: Original model with FakeQuantize @@ -113,7 +113,7 @@ Quantize and dequantization operations In this case ``FakeQuantize`` operation and ``Convert`` are used as quantize operation and return quantized low precision tensor. After quantize operation on activations there are ``Convert`` and dequantization operations to compensate decomposition. Original input model: -.. image:: ../../../../_static/images/model_qdq_and_convolution.common.svg +.. image:: ../../../../assets/images/model_qdq_and_convolution.common.svg :alt: Original model with Q/DQ In both cases result is the same. In LPT result model you can see that: @@ -129,7 +129,7 @@ In both cases result is the same. In LPT result model you can see that: LPT result model: -.. image:: ../../../../_static/images/model_fq_and_convolution.transformed.svg +.. image:: ../../../../assets/images/model_fq_and_convolution.transformed.svg :alt: Result model Low precision transformations pipeline @@ -137,7 +137,7 @@ Low precision transformations pipeline LPT transformation pipeline has several steps. For each transformation inside one step pattern matcher is unique per transformation, but each operation can be assigned to several transformations. -.. image:: ../../../../_static/images/low_precision_transformation_pipeline.svg +.. image:: ../../../../assets/images/low_precision_transformation_pipeline.svg :alt: Low precision transformations pipeline Inside each step LPT transformations handle input model operation by operation, applying transformation matching pattern for each transformation from the step to an operation, and execute transformation if pattern is matched. 
Decomposition transformation decomposes ``FakeQuantize`` to quantize and dequantization operations. Dequantization operations from previous transformation result is used for the current one and so on, until the end of the model is achieved. @@ -227,12 +227,12 @@ Decomposition transformations decompose the ``FakeQuantize`` operation to: quant Original ``FakeQuantize``: -.. image:: ../../../../_static/images/fq.common.svg +.. image:: ../../../../assets/images/fq.common.svg :alt: FakeQuantize operation before LPT ``FakeQuantize`` after decomposition to quantization and dequantization operations: -.. image:: ../../../../_static/images/fq.transformed.svg +.. image:: ../../../../assets/images/fq.transformed.svg :alt: FakeQuantize operation after LPT Dequantization operations handling transformations @@ -242,12 +242,12 @@ In this step, LPT transformations fuse dequantization operations or move them th Original ``Convolution`` operation in FP32 with dequantization operations before: -.. image:: ../../../../_static/images/model_fq_and_convolution.common.svg +.. image:: ../../../../assets/images/model_fq_and_convolution.common.svg :alt: Convolution operation before LPT ``Convolution`` operation in INT8 after decomposition and dequantization operations handling: -.. image:: ../../../../_static/images/model_fq_and_convolution.transformed.svg +.. image:: ../../../../assets/images/model_fq_and_convolution.transformed.svg :alt: Convolution operation after LPT @@ -270,12 +270,12 @@ There are more details in developer guide :doc:`Cleanup transformations `. Interpreting FakeQuantize at runtime @@ -27,45 +27,45 @@ During the model load each plugin can interpret quantization rules expressed in * Independently based on the definition of *FakeQuantize* operation. * Using a special library of low-precision transformations (LPT) which applies common rules for generic operations, such as Convolution, Fully-Connected, Eltwise, etc., and translates "fake-quantized" models into models with low-precision operations. -Here we provide only a high-level overview of the interpretation rules of FakeQuantize. -At runtime each FakeQuantize can be split into two independent operations: **Quantize** and **Dequantize**. -The former one is aimed to transform the input data into the target precision while the latter transforms the resulting values back to the original range and precision. -In practice *Dequantize* operations can be propagated forward through the linear operations, such as *Convolution* or *Fully-Connected*, +Here we provide only a high-level overview of the interpretation rules of FakeQuantize. +At runtime each FakeQuantize can be split into two independent operations: **Quantize** and **Dequantize**. +The former one is aimed to transform the input data into the target precision while the latter transforms the resulting values back to the original range and precision. +In practice *Dequantize* operations can be propagated forward through the linear operations, such as *Convolution* or *Fully-Connected*, and in some cases fused with the following *Quantize* operation for the next layer into the so-called *Requantize* operation (see Fig. 1). -.. image:: ../../../../_static/images/qdq_propagation.png +.. image:: ../../../../assets/images/qdq_propagation.png Figure 1. Quantization operations propagation at runtime. Q, DQ, RQ stand for Quantize, Dequantize, and Requantize correspondingly. 
-From the calculation standpoint, the FakeQuantize formula also is split into two parts accordingly: +From the calculation standpoint, the FakeQuantize formula also is split into two parts accordingly: ``output = round((x - input_low) / (input_high - input_low) * (levels-1)) / (levels-1) * (output_high - output_low) + output_low`` -The first part of this formula represents *Quantize* operation: +The first part of this formula represents *Quantize* operation: -``q = round((x - input_low) / (input_high - input_low) * (levels-1))`` +``q = round((x - input_low) / (input_high - input_low) * (levels-1))`` -The second is responsible for the dequantization: +The second is responsible for the dequantization: -``r = q / (levels-1) * (output_high - output_low) + output_low`` +``r = q / (levels-1) * (output_high - output_low) + output_low`` -From the scale/zero-point notation standpoint the latter formula can be written as follows: +From the scale/zero-point notation standpoint the latter formula can be written as follows: -``r = (output_high - output_low) / (levels-1) * (q + output_low / (output_high - output_low) * (levels-1))`` +``r = (output_high - output_low) / (levels-1) * (q + output_low / (output_high - output_low) * (levels-1))`` Thus we can define: * **Scale** as ``(output_high - output_low) / (levels-1)`` * **Zero-point** as ``-output_low / (output_high - output_low) * (levels-1)`` -.. note:: +.. note:: During the quantization process the values ``input_low``, ``input_high``, ``output_low``, ``output_high`` are selected so that to map a floating-point zero exactly to an integer value (zero-point) and vice versa. Quantization specifics and restrictions ####################################### In general, OpenVINO can represent and execute quantized models from different sources. However, the Neural Network Compression Framework (NNCF) -is considered the default way to get optimized models. Since the NNCF supports HW-aware quantization it means that specific rules can be implemented in it for +is considered the default way to get optimized models. Since the NNCF supports HW-aware quantization it means that specific rules can be implemented in it for the particular HW. However, it is reasonable to have compatibility with general-purpose HW such as CPU and GPU and support their quantization schemes. Below we define these rules as follows: @@ -73,6 +73,6 @@ Below we define these rules as follows: * Per-channel quantization of weights of Convolutional and Fully-Connected layers. * Per-channel quantization of activations for channel-wise and element-wise operations, e.g. Depthwise Convolution, Eltwise Add/Mul, ScaleShift. * Symmetric and asymmetric quantization of weights and activations with the support of per-channel scales and zero-points. -* Non-unified quantization parameters for Eltwise and Concat operations. +* Non-unified quantization parameters for Eltwise and Concat operations. * Non-quantized network output, i.e. there are no quantization parameters for it. 
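To make the quantize/dequantize split above concrete, here is a small NumPy sketch (illustration only, not OpenVINO code) that evaluates the *Quantize* and *Dequantize* parts of the formula and checks that the scale/zero-point form reproduces the same dequantized values. The ranges and input values are arbitrary example numbers.

.. code-block:: python

   import numpy as np

   # Example ranges; FakeQuantize clamps x to [input_low, input_high] first,
   # and the values below are already inside that range.
   x = np.array([-1.0, -0.37, 0.0, 0.42, 1.0], dtype=np.float32)
   input_low, input_high = -1.0, 1.0
   output_low, output_high = -1.0, 1.0
   levels = 256

   # Quantize: q = round((x - input_low) / (input_high - input_low) * (levels - 1))
   q = np.round((x - input_low) / (input_high - input_low) * (levels - 1))

   # Dequantize: r = q / (levels - 1) * (output_high - output_low) + output_low
   r = q / (levels - 1) * (output_high - output_low) + output_low

   # Equivalent scale/zero-point view of the dequantization
   scale = (output_high - output_low) / (levels - 1)
   zero_point = -output_low / (output_high - output_low) * (levels - 1)
   assert np.allclose(r, scale * (q - zero_point))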
diff --git a/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/quantized-models/low-precision-model-representation.rst b/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/quantized-models/low-precision-model-representation.rst index 2435aebd6a4242..abecc2cfa8f580 100644 --- a/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/quantized-models/low-precision-model-representation.rst +++ b/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/quantized-models/low-precision-model-representation.rst @@ -24,7 +24,7 @@ In order to be able to execute a particular DL operation in low-precision all it between operation and data blobs. The figure below shows an example of quantized Convolution which contains two FakeQuantize nodes: one for weights and one for activations (bias is quantized using the same parameters). -.. image:: ../../../../../_static/images/IE_PLUGIN_DG/images/quantized_convolution.png +.. image:: ../../../../../assets/images/quantized_convolution.png Starting from OpenVINO 2020.2 release all the quantized models are represented in the compressed form. It means that the weights @@ -32,4 +32,4 @@ of low-precision operations are converted into the target precision (e.g. INT8). The rest of the parameters can be represented in FLOAT32 or FLOAT16 precision depending on the input full-precision model used in the quantization process. Fig. 2 below shows an example of the part of the compressed IR. -.. image:: ../../../../../_static/images/IE_PLUGIN_DG/images/quantized_model_example.png +.. image:: ../../../../../assets/images/quantized_model_example.png diff --git a/docs/articles_en/documentation/openvino-extensibility/transformation-api.rst b/docs/articles_en/documentation/openvino-extensibility/transformation-api.rst index 2f6ea47c4441cc..5e28a22e69ab98 100644 --- a/docs/articles_en/documentation/openvino-extensibility/transformation-api.rst +++ b/docs/articles_en/documentation/openvino-extensibility/transformation-api.rst @@ -49,7 +49,7 @@ Let's start with OpenVINO™ helper functions. The most popular function is ``ov We will review real replacement case where Negative operation is replaced with Multiply. -.. image:: ./../../_static/images/ov_replace_node.png +.. image:: ../../assets/images/ov_replace_node.png .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.cpp :language: cpp @@ -65,7 +65,7 @@ The alternative way to do the same replacement is the following: Another transformation example is insertion. -.. image:: ./../../_static/images/ov_insert_node.png +.. image:: ../../assets/images/ov_insert_node.png .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.cpp :language: cpp @@ -101,7 +101,7 @@ OpenVINO™ Runtime has three main transformation types: * :doc:`Matcher pass ` - pattern-based transformation approach * :doc:`Graph rewrite pass ` - container for matcher passes needed for efficient execution -.. image:: ./../../_static/images/transformations_structure.png +.. 
image:: ../../assets/images/transformations_structure.png Transformation conditional compilation ###################################### diff --git a/docs/articles_en/documentation/openvino-extensibility/transformation-api/graph-rewrite-pass.rst b/docs/articles_en/documentation/openvino-extensibility/transformation-api/graph-rewrite-pass.rst index 82057f8cb153d2..e2bc1f8d4bbf00 100644 --- a/docs/articles_en/documentation/openvino-extensibility/transformation-api/graph-rewrite-pass.rst +++ b/docs/articles_en/documentation/openvino-extensibility/transformation-api/graph-rewrite-pass.rst @@ -24,12 +24,12 @@ In addition, GraphRewrite handles nodes that were registered by MatcherPasses du GraphRewrite has two algorithms for MatcherPasses execution. First algorithm is straightforward. It applies each MatcherPass in registration order to current node. -.. image:: ./../../../_static/images/graph_rewrite_execution.png +.. image:: ../../../assets/images/graph_rewrite_execution.png But it is not really efficient when you have a lot of registered passes. So first of all GraphRewrite checks that all MatcherPass patterns has type-based root node (it means that type of this node is not hidden into predicate). And then creates map from registered MatcherPasses. That helps to avoid additional cost of applying each MatcherPass for each node. -.. image:: ./../../../_static/images/graph_rewrite_efficient_search.png +.. image:: ../../../assets/images/graph_rewrite_efficient_search.png .. note:: diff --git a/docs/articles_en/documentation/openvino-ir-format/intermediate-representation-int8-inference.rst b/docs/articles_en/documentation/openvino-ir-format/intermediate-representation-int8-inference.rst index d9557f98827aa0..a0cc4488ef15cc 100644 --- a/docs/articles_en/documentation/openvino-ir-format/intermediate-representation-int8-inference.rst +++ b/docs/articles_en/documentation/openvino-ir-format/intermediate-representation-int8-inference.rst @@ -28,7 +28,7 @@ For more details, see the :doc:`specification of FakeQuantize operation ` * :doc:`RNNSequence <../operation-specs/sequence/rnn-sequence-5>` * :doc:`ROIAlign <../operation-specs/detection/roi-align-9>` -* :doc:`ROIAlignRotated <../operation-specs/detection/roi-align-rotated-14>` * :doc:`ROIPooling <../operation-specs/detection/roi-pooling-1>` * :doc:`Roll <../operation-specs/movement/roll-7>` * :doc:`Round <../operation-specs/arithmetic/round-5>` diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst index a39de0b72d5a8e..250ef955bb41a8 100644 --- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst @@ -188,7 +188,6 @@ Operation Specifications RNNSequence-5 ROIAlign-3 ROIAlign-9 - ROIAlignRotated-14 ROIPooling-1 Roll-7 Round-5 diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/detection/roi-align-rotated-14.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/detection/roi-align-rotated-15.rst similarity index 95% rename from docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/detection/roi-align-rotated-14.rst rename to docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/detection/roi-align-rotated-15.rst index 7ec8acdd2238b6..1da1e33079c106 100644 --- 
a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/detection/roi-align-rotated-14.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/detection/roi-align-rotated-15.rst @@ -1,14 +1,14 @@ -.. {#openvino_docs_ops_detection_ROIAlignRotated_14} +.. {#openvino_docs_ops_detection_ROIAlignRotated_15} ROIAlignRotated =============== .. meta:: - :description: Learn about ROIAlignRotated-14 - an object detection operation, + :description: Learn about ROIAlignRotated-15 - an object detection operation, which can be performed on three required input tensors. -**Versioned name**: *ROIAlignRotated-14* +**Versioned name**: *ROIAlignRotated-15* **Category**: *Object detection* @@ -56,7 +56,7 @@ Each ROI box's center is shifted by [-0.5, -0.5] before pooling to achive better * *spatial_scale* * **Description**: *spatial_scale* is a multiplicative spatial scale factor to that is applied to the ROI box(height, weight and center vector) before pooling. - WARNING! + WARNING! Spatial scale is also applied to the center point of the ROI box. It means that scaling does not only change the size of the ROI box, but also its position. For example, if the spatial scale is 2.0, ROI box center is [0.5, 0.5], box width is 1.0 and box height is 1.0, then after scaling the ROI box center will be [1.0, 1.0], box width will be 2.0 and box height will be 2.0. * **Range of values**: a positive floating-point number @@ -67,7 +67,7 @@ Each ROI box's center is shifted by [-0.5, -0.5] before pooling to achive better * **Description**: If True, the angle for each ROI represents a clockwise rotation, otherwise - counterclockwise rotation. * **Type**: ``bool`` - * **Default value**: False + * **Default value**: False * **Required**: *no* **Inputs**: diff --git a/docs/articles_en/documentation/openvino-security/openvino-encrypted-models.rst b/docs/articles_en/documentation/openvino-security/openvino-encrypted-models.rst index 7a18392d2df38a..1a5d7261440c5e 100644 --- a/docs/articles_en/documentation/openvino-security/openvino-encrypted-models.rst +++ b/docs/articles_en/documentation/openvino-security/openvino-encrypted-models.rst @@ -22,7 +22,7 @@ After a model is optimized by model conversion API, it's deployed to target devi Encrypting and optimizing model before deploying it to the edge device can be used to protect deep-learning models. The edge device should keep the stored model protected all the time and have the model decrypted **in runtime only** for use by the OpenVINO Runtime. -.. image:: ../../_static/images/deploy_encrypted_model.svg +.. image:: ../../assets/images/deploy_encrypted_model.svg Loading Encrypted Models ######################## diff --git a/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst b/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst index e6d8b3a4170d04..ba5dd9dec91e65 100644 --- a/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst +++ b/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst @@ -87,13 +87,13 @@ To check if the driver has been installed: 1. Type **device manager** in the **Search Windows** field and press Enter. **Device Manager** will open. 2. Click the drop-down arrow to display **Display Adapters**. You can see the adapter that is installed in your computer: - .. image:: ../../_static/images/DeviceManager.PNG + .. image:: ../../assets/images/DeviceManager.PNG :width: 400 3. 
Right-click on the adapter name and select **Properties**. 4. Click the **Driver** tab to view the driver version. - .. image:: ../../_static/images/DeviceDriverVersion.svg + .. image:: ../../assets/images/DeviceDriverVersion.svg :width: 400 Your device driver has been updated and is now ready to use your GPU. diff --git a/docs/articles_en/learn-openvino/interactive-tutorials-python.rst b/docs/articles_en/learn-openvino/interactive-tutorials-python.rst index 98719478526bb9..d1215627a4d381 100644 --- a/docs/articles_en/learn-openvino/interactive-tutorials-python.rst +++ b/docs/articles_en/learn-openvino/interactive-tutorials-python.rst @@ -48,10 +48,10 @@ Additional Resources * `Google Colab `__ -.. |binder logo| image:: ../_static/images/launch_in_binder.svg +.. |binder logo| image:: ../assets/images/launch_in_binder.svg :class: notebook-badge-p :alt: Binder button -.. |colab logo| image:: ../_static/images/open_in_colab.svg +.. |colab logo| image:: ../assets/images/open_in_colab.svg :class: notebook-badge-p :alt: Google Colab button diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/ov-tokenizers.rst b/docs/articles_en/learn-openvino/llm_inference_guide/ov-tokenizers.rst index e7e59ba5755ec0..571743701ce01a 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/ov-tokenizers.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/ov-tokenizers.rst @@ -8,7 +8,7 @@ Tokenizers convert the input text into a sequence of tokens with corresponding I the model can understand and process it during inference. The transformation of a sequence of numbers into a string is called detokenization. -.. image:: ../../_static/images/tokenization.svg +.. image:: ../../assets/images/tokenization.svg :align: center There are two important points in the tokenizer-model relation: diff --git a/docs/articles_en/learn-openvino/openvino-samples/get-started-demos.rst b/docs/articles_en/learn-openvino/openvino-samples/get-started-demos.rst index 9d0ff76275661e..a0137b0ee25d8f 100644 --- a/docs/articles_en/learn-openvino/openvino-samples/get-started-demos.rst +++ b/docs/articles_en/learn-openvino/openvino-samples/get-started-demos.rst @@ -472,7 +472,7 @@ The following command shows how to run the Image Classification Code Sample usin When the sample application is complete, you are given the label and confidence for the top 10 categories. The input image and sample output of the inference results is shown below: -.. image:: ../../_static/images/dog.png +.. image:: ../../assets/images/dog.png .. code-block:: sh diff --git a/docs/articles_en/openvino-workflow.rst b/docs/articles_en/openvino-workflow.rst index b9a7162f4786a7..9c984b674a28da 100644 --- a/docs/articles_en/openvino-workflow.rst +++ b/docs/articles_en/openvino-workflow.rst @@ -6,8 +6,8 @@ OpenVINO Workflow .. meta:: - :description: OpenVINO toolkit workflow usually involves preparation, - optimization, and compression of models, running inference and + :description: OpenVINO toolkit workflow usually involves preparation, + optimization, and compression of models, running inference and deploying deep learning applications. .. toctree:: @@ -20,7 +20,7 @@ OpenVINO Workflow Deployment on a Local System Deployment on a Model Server openvino-workflow/torch-compile - + OpenVINO offers multiple workflows, depending on the use case and personal or project preferences. This section will give you a detailed view of how you can go from preparing your model, @@ -33,18 +33,18 @@ you can decide how to proceed: .. 
tab-item:: Workflow for convenience - This approach assumes you run your model directly. + This approach assumes you run your model directly. - .. image:: _static/images/ov_workflow_diagram_convenience.svg + .. image:: ./assets/images/ov_workflow_diagram_convenience.svg :align: center :alt: OpenVINO workflow diagram for convenience .. tab-item:: Workflow for performance (recommended for production) This approach assumes you convert your model to OpenVINO IR explicitly, which means the - conversion stage is not part of the final application. + conversion stage is not part of the final application. - .. image:: _static/images/ov_workflow_diagram_performance.svg + .. image:: ./assets/images/ov_workflow_diagram_performance.svg :align: center :alt: OpenVINO workflow diagram for performance @@ -74,7 +74,7 @@ OpenVINO uses the following functions for reading, converting, and saving models .. tab-item:: save_model * Saves an ov.Model to OpenVINO IR format. - * Compresses weights to FP16 by default. + * Compresses weights to FP16 by default. * This method is only available in the Python API. @@ -82,14 +82,14 @@ OpenVINO uses the following functions for reading, converting, and saving models | Learn how to convert pre-trained models to OpenVINO IR. | :doc:`Model Optimization and Compression ` -| Find out how to optimize a model to achieve better inference performance, utilizing - multiple optimization methods for both in-training compression and post-training quantization. +| Find out how to optimize a model to achieve better inference performance, utilizing + multiple optimization methods for both in-training compression and post-training quantization. | :doc:`Running Inference ` -| See how to run inference with OpenVINO, which is the most basic form of deployment, +| See how to run inference with OpenVINO, which is the most basic form of deployment, and the quickest way of running a deep learning model. -| :doc:`Deployment Option 1. Using OpenVINO Runtime ` +| :doc:`Deployment Option 1. Using OpenVINO Runtime ` | Deploy a model locally, reading the file directly from your application and utilizing about-openvino/additional-resources available to the system. | Deployment on a local system uses the steps described in the section on running inference. diff --git a/docs/articles_en/openvino-workflow/deployment-locally.rst b/docs/articles_en/openvino-workflow/deployment-locally.rst index 6bdbece9dd119a..657c1f2ce63d4e 100644 --- a/docs/articles_en/openvino-workflow/deployment-locally.rst +++ b/docs/articles_en/openvino-workflow/deployment-locally.rst @@ -61,7 +61,7 @@ Granularity of Major Distribution Types The granularity of OpenVINO packages may vary for different distribution types. For example, the PyPI distribution of OpenVINO has a `single 'openvino' package `__ that contains all the runtime libraries and plugins, while a :doc:`local distribution ` is a more configurable type providing higher granularity. Below are important details of the set of libraries included in the OpenVINO Runtime package: -.. image:: ../_static/images/deployment_simplified.svg +.. image:: ../assets/images/deployment_simplified.svg - The main library ``openvino`` is used by users' C++ applications to link against with. For C language applications, ``openvino_c`` is additionally required for distribution. The library includes OpenVINO API 2.0. 
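As a quick illustration of the reading, converting, and saving functions listed in the workflow above, the following sketch converts a source model to ``ov.Model``, saves it as OpenVINO IR (weights are compressed to FP16 by default), and reads it back for compilation. The file names are placeholders, not part of the original documentation.

.. code-block:: python

   import openvino as ov

   # "model.onnx" is a placeholder for any supported source model file.
   ov_model = ov.convert_model("model.onnx")

   # save_model() writes OpenVINO IR and compresses weights to FP16 by default.
   ov.save_model(ov_model, "model.xml")

   # Later, the IR can be read and compiled directly for inference.
   compiled = ov.Core().compile_model("model.xml", "CPU")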
diff --git a/docs/articles_en/openvino-workflow/deployment-locally/local-distribution-libraries.rst b/docs/articles_en/openvino-workflow/deployment-locally/local-distribution-libraries.rst index ca471628f0ee23..9889f15c0ecbd9 100644 --- a/docs/articles_en/openvino-workflow/deployment-locally/local-distribution-libraries.rst +++ b/docs/articles_en/openvino-workflow/deployment-locally/local-distribution-libraries.rst @@ -35,7 +35,7 @@ Libraries for Pluggable Components The picture below presents dependencies between the OpenVINO Runtime core and pluggable libraries: -.. image:: ../../_static/images/deployment_full.svg +.. image:: ../../assets/images/deployment_full.svg Libraries for Compute Devices +++++++++++++++++++++++++++++ diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training.rst index c1796c87113ca1..ec08b12894e1aa 100644 --- a/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training.rst +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training.rst @@ -48,11 +48,6 @@ To install the latest released version via pip manager run the following command pip install nncf -.. note:: - - To install with specific frameworks, use the `pip install nncf[extras]` command, where extras is a list of possible extras, for example, `torch`, `tf`, `onnx`. - - To install the latest NNCF version from source, follow the instruction on `GitHub `__. .. note:: @@ -64,7 +59,7 @@ Working with NNCF The figure below shows a common workflow of applying training-time compressions with NNCF. The NNCF optimizations are added to the TensorFlow or PyTorch training script, and then the model undergoes fine-tuning. The optimized model can then be exported to OpenVINO IR format for accelerated performance with OpenVINO Runtime. -.. image:: ../../_static/images/nncf_workflow.svg +.. image:: ../../assets/images/nncf_workflow.svg Training-Time Compression Methods diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training-pytorch.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training-pytorch.rst new file mode 100644 index 00000000000000..91b405d43e92b3 --- /dev/null +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training-pytorch.rst @@ -0,0 +1,72 @@ +Quantization-aware Training (QAT) with PyTorch +=============================================== + +Below are the steps required to integrate QAT from NNCF into a training script written with +PyTorch: + + +1. Apply Post Training Quantization to the Model +################################################## + +Quantize the model using the :doc:`Post-Training Quantization <../quantizing-models-post-training/basic-quantization-flow>` method. + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [quantize] + + +2. Fine-tune the Model +######################## + +This step assumes applying fine-tuning to the model the same way it is done for the baseline model. For QAT, it is required to train the model for a few epochs with a small learning rate, for example, 10e-5. 
+Quantized models perform all computations in floating-point precision during fine-tuning by modeling quantization errors in both forward and backward passes. + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [tune_model] + + +.. note:: + The precision of weights transitions to INT8 only after converting the model to OpenVINO Intermediate Representation. + You can expect a reduction in model footprint only for that format. + + +These steps outline the basics of applying the QAT method from the NNCF. However, in some cases, it is required to save/load model +checkpoints during training. Since NNCF wraps the original model with its own object, it provides an API for these needs. + +3. (Optional) Save Checkpoint +#################################### + +To save a model checkpoint, use the following API: + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [save_checkpoint] + + +4. (Optional) Restore from Checkpoint +################################################ + +To restore the model from checkpoint, use the following API: + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [load_checkpoint] + + +Deploying the Quantized Model +############################### + +The model can be converted into the OpenVINO Intermediate Representation (IR) if needed, compiled, and run with OpenVINO without any additional steps. + +.. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_torch.py + :language: python + :fragment: [inference] + +For more details, see the corresponding :doc:`documentation <../../running-inference>`. + +Examples +#################### + +* `Quantization-aware Training of Resnet18 PyTorch Model `__ +* `Quantization-aware Training of STFPM PyTorch Model `__ diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training-tensorflow.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training-tensorflow.rst new file mode 100644 index 00000000000000..41a2ea615214a8 --- /dev/null +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training-tensorflow.rst @@ -0,0 +1,112 @@ +Quantization-aware Training (QAT) with TensorFlow +=================================================== + +Below are the steps required to integrate QAT from NNCF into a training script written with TensorFlow: + +.. note:: + Currently, NNCF for TensorFlow supports optimization of the models created using Keras + `Sequential API `__ or + `Functional API `__. + +1. Import NNCF API +######################## + +Add NNCF-related imports in the beginning of the training script: + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [imports] + +2. Create NNCF Configuration +#################################### + +Define NNCF configuration which consists of model-related parameters (the ``"input_info"`` section) and parameters +of optimization methods (the ``"compression"`` section). For faster convergence, it is also recommended to register a dataset object +specific to the DL framework. The data object will be used at the model creation step to initialize quantization parameters. + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [nncf_congig] + + +3. 
Apply Optimization Methods +#################################### + +Wrap the original model object with the ``create_compressed_model()`` API using the configuration +defined in the previous step. This method returns a so-called compression controller and a wrapped model that can be used the +same way as the original model. Optimization methods are applied at this step, so that the model +undergoes a set of corresponding transformations and contains additional operations required for optimization. In case of QAT, the compression controller object is used for model export and, optionally, in distributed training as demonstrated below. + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [wrap_model] + + +4. Fine-tune the Model +#################################### + +This step assumes applying fine-tuning to the model the same way it is done for the baseline model. For QAT, it is required to train the model for a few epochs with a small learning rate, for example, 10e-5. In principle, +you can skip this step, meaning that the post-training optimization will be applied to the model. + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [tune_model] + + +5. Multi-GPU Distributed Training +#################################### + +In the case of distributed multi-GPU training (not DataParallel), call ``compression_ctrl.distributed()`` before fine-tuning. This informs optimization methods to make adjustments to function in the distributed mode. + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [distributed] + + +.. note:: + The precision of weights transitions to INT8 only after converting the model to OpenVINO Intermediate Representation. + You can expect a reduction in model footprint only for that format. + + +These steps outline the basics of applying the QAT method from the NNCF. However, in some cases, it is required to save/load model +checkpoints during training. Since NNCF wraps the original model with its own object, it provides an API for these needs. + +6. (Optional) Save Checkpoint +#################################### + +To save a model checkpoint, use the following API: + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [save_checkpoint] + + +7. (Optional) Restore from Checkpoint +################################################ + +To restore the model from checkpoint, use the following API: + +.. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [load_checkpoint] + + +For more details on saving/loading checkpoints in the NNCF, see the corresponding `NNCF documentation `__. + +Deploying quantized model +######################### + +The model can be converted into the OpenVINO Intermediate Representation (IR) if needed, compiled and run with OpenVINO. +No extra steps or options are required. + +.. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_tensorflow.py + :language: python + :fragment: [inference] + +For more details, see the corresponding :doc:`documentation <../../running-inference>`. 
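For the deployment step just described, a minimal sketch could look like the code below. It assumes a fine-tuned Keras model object named ``quantized_model`` and an NHWC input of shape ``1x224x224x3``; both names and shapes are assumptions for illustration and are not taken from the snippets referenced above.

.. code-block:: python

   import numpy as np
   import openvino as ov

   # `quantized_model` is assumed to be the fine-tuned tf.keras.Model from
   # the steps above; convert_model() accepts Keras models directly.
   ov_model = ov.convert_model(quantized_model)
   compiled = ov.Core().compile_model(ov_model, "CPU")

   # Dummy input used only for illustration; the shape is an assumption.
   result = compiled(np.zeros((1, 224, 224, 3), dtype=np.float32))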
+ +Examples +#################### + +* `Quantizing TensorFlow model with NNCF `__ + diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training.rst index cce63315939aaf..f5ec455a7e2e15 100644 --- a/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training.rst +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/quantization-aware-training.rst @@ -1,8 +1,12 @@ -.. {#qat_introduction} - Quantization-aware Training (QAT) ================================= +.. toctree:: + :maxdepth: 1 + :hidden: + + Quantization-aware Training with PyTorch + Quantization-aware Training with TensorFlow Introduction #################### @@ -12,223 +16,6 @@ degradation caused by quantization. In fact, this is the most accurate quantizat apply QAT from the Neural Network Compression Framework (NNCF) to get 8-bit quantized models. This assumes that you are knowledgeable in Python programming and familiar with the training code for the model in the source DL framework. -Using NNCF QAT -#################### - -Here, we provide the steps that are required to integrate QAT from NNCF into the training script written with -PyTorch or TensorFlow 2: - -.. note:: - Currently, NNCF for TensorFlow 2 supports optimization of the models created using Keras - `Sequential API `__ or - `Functional API `__. - -1. Import NNCF API -++++++++++++++++++++ - -In this step, you add NNCF-related imports in the beginning of the training script: - -.. tab-set:: - - .. tab-item:: PyTorch - :sync: pytorch - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py - :language: python - :fragment: [imports] - - .. tab-item:: TensorFlow 2 - :sync: tensorflow-2 - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py - :language: python - :fragment: [imports] - -2. Create NNCF configuration -++++++++++++++++++++++++++++ - -Here, you should define NNCF configuration which consists of model-related parameters (``"input_info"`` section) and parameters -of optimization methods (``"compression"`` section). For faster convergence, it is also recommended to register a dataset object -specific to the DL framework. It will be used at the model creation step to initialize quantization parameters. - -.. tab-set:: - - .. tab-item:: PyTorch - :sync: pytorch - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py - :language: python - :fragment: [nncf_congig] - - .. tab-item:: TensorFlow 2 - :sync: tensorflow-2 - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py - :language: python - :fragment: [nncf_congig] - - -3. Apply optimization methods -+++++++++++++++++++++++++++++ - -In the next step, you need to wrap the original model object with the ``create_compressed_model()`` API using the configuration -defined in the previous step. This method returns a so-called compression controller and a wrapped model that can be used the -same way as the original model. It is worth noting that optimization methods are applied at this step so that the model -undergoes a set of corresponding transformations and can contain additional operations required for the optimization. 
In -the case of QAT, the compression controller object is used for model export and, optionally, in distributed training as it -will be shown below. - -.. tab-set:: - - .. tab-item:: PyTorch - :sync: pytorch - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py - :language: python - :fragment: [wrap_model] - - .. tab-item:: TensorFlow 2 - :sync: tensorflow-2 - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py - :language: python - :fragment: [wrap_model] - - -4. Fine-tune the model -++++++++++++++++++++++ - -This step assumes that you will apply fine-tuning to the model the same way as it is done for the baseline model. In the -case of QAT, it is required to train the model for a few epochs with a small learning rate, for example, 10e-5. In principle, -you can skip this step which means that the post-training optimization will be applied to the model. - -.. tab-set:: - - .. tab-item:: PyTorch - :sync: pytorch - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py - :language: python - :fragment: [tune_model] - - .. tab-item:: TensorFlow 2 - :sync: tensorflow-2 - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py - :language: python - :fragment: [tune_model] - - - -5. Multi-GPU distributed training -+++++++++++++++++++++++++++++++++ - -In the case of distributed multi-GPU training (not DataParallel), you should call ``compression_ctrl.distributed()`` before -the fine-tuning that will inform optimization methods to do some adjustments to function in the distributed mode. - -.. tab-set:: - - .. tab-item:: PyTorch - :sync: pytorch - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py - :language: python - :fragment: [distributed] - - .. tab-item:: TensorFlow 2 - :sync: tensorflow-2 - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py - :language: python - :fragment: [distributed] - -6. Export quantized model -+++++++++++++++++++++++++ - -When fine-tuning finishes, the quantized model can be exported to the corresponding format for further inference: ONNX in -the case of PyTorch and frozen graph - for TensorFlow 2. - -.. tab-set:: - - .. tab-item:: PyTorch - :sync: pytorch - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py - :language: python - :fragment: [export] - - .. tab-item:: TensorFlow 2 - :sync: tensorflow-2 - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py - :language: python - :fragment: [export] - - -.. note:: - The precision of weights gets INT8 only after the step of model conversion to OpenVINO Intermediate Representation. - You can expect the model footprint reduction only for that format. - - -These were the basic steps to applying the QAT method from the NNCF. However, it is required in some cases to save/load model -checkpoints during the training. Since NNCF wraps the original model with its own object it provides an API for these needs. - -7. (Optional) Save checkpoint -+++++++++++++++++++++++++++++ - -To save model checkpoint use the following API: - -.. tab-set:: - - .. tab-item:: PyTorch - :sync: pytorch - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py - :language: python - :fragment: [save_checkpoint] - - .. tab-item:: TensorFlow 2 - :sync: tensorflow-2 - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py - :language: python - :fragment: [save_checkpoint] - - -8. 
(Optional) Restore from checkpoint -+++++++++++++++++++++++++++++++++++++ - -To restore the model from checkpoint you should use the following API: - -.. tab-set:: - - .. tab-item:: PyTorch - :sync: pytorch - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py - :language: python - :fragment: [load_checkpoint] - - .. tab-item:: TensorFlow 2 - :sync: tensorflow-2 - - .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py - :language: python - :fragment: [load_checkpoint] - - -For more details on saving/loading checkpoints in the NNCF, see the following `documentation `__. - -Deploying quantized model -######################### - -The quantized model can be deployed with OpenVINO in the same way as the baseline model. No extra steps or options are -required in this case. For more details, see the corresponding :doc:`documentation <../../running-inference>`. - -Examples -#################### - -* `Quantizing PyTorch model with NNCF `__ - -* `Quantizing TensorFlow model with NNCF `__ +:doc:`Quantization-aware Training with PyTorch ` +:doc:`Quantization-aware Training with TensorFlow ` \ No newline at end of file diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/quantizing-models-post-training.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/quantizing-models-post-training.rst index ae1a87c0260cd1..d34da0d615f3bc 100644 --- a/docs/articles_en/openvino-workflow/model-optimization-guide/quantizing-models-post-training.rst +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/quantizing-models-post-training.rst @@ -24,7 +24,7 @@ speed, mostly due to reduced throughput. The reduction is performed before the a when the model gets transformed into the quantized representation. The process does not require any training datasets or pipelines in the source DL framework. -.. image:: ../../_static/images/quantization_picture.svg +.. image:: ../../assets/images/quantization_picture.svg `Neural Network Compression Framework (NNCF) `__ provides a post-training quantization API, available in Python, that aims at reusing the code for diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/quantizing-models-post-training/basic-quantization-flow.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/quantizing-models-post-training/basic-quantization-flow.rst index de0b0f96cc0e1d..2b2136c6a255d2 100644 --- a/docs/articles_en/openvino-workflow/model-optimization-guide/quantizing-models-post-training/basic-quantization-flow.rst +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/quantizing-models-post-training/basic-quantization-flow.rst @@ -106,7 +106,7 @@ See the `example section <#examples-of-how-to-apply-nncf-post-training-quantizat After that the model can be converted into the OpenVINO Intermediate Representation (IR) if needed, compiled and run with OpenVINO. -If you have not already installed OpenVINO developer tools, install it with ``pip install openvino-dev``. +If you have not already installed OpenVINO developer tools, install it with ``pip install openvino``. .. 
tab-set:: diff --git a/docs/articles_en/openvino-workflow/model-optimization.rst b/docs/articles_en/openvino-workflow/model-optimization.rst index 1203b99f6486ea..2cf08990d6b1d6 100644 --- a/docs/articles_en/openvino-workflow/model-optimization.rst +++ b/docs/articles_en/openvino-workflow/model-optimization.rst @@ -17,21 +17,21 @@ Model optimization is an optional offline step of improving the final model perf - :doc:`Post-training Quantization ` is designed to optimize the inference of deep learning models by applying the post-training 8-bit integer quantization that does not require model retraining or fine-tuning. -- :doc:`Training-time Optimization `, a suite of advanced methods for training-time model optimization within the DL framework, such as PyTorch and TensorFlow 2.x. It supports methods like Quantization-aware Training, Structured and Unstructured Pruning, etc. +- :doc:`Training-time Optimization `, a suite of advanced methods for training-time model optimization within the DL framework, such as PyTorch and TensorFlow 2.x. It supports methods like Quantization-aware Training, Structured and Unstructured Pruning, etc. - :doc:`Weight Compression `, an easy-to-use method for Large Language Models footprint reduction and inference acceleration. .. note:: OpenVINO also supports optimized models (for example, quantized) from source frameworks such as PyTorch, TensorFlow, and ONNX (in Q/DQ; Quantize/DeQuantize format). No special steps are required in this case and optimized models can be converted to the OpenVINO Intermediate Representation format (IR) right away. -Post-training Quantization is the fastest way to optimize an arbitrary DL model and should be applied first, but it is limited in terms of achievable accuracy-performance trade-off. The recommended approach to obtain OpenVINO quantized model is to convert a model from original framework to ``ov.Model`` and ensure that the model works correctly in OpenVINO, for example, by calculating the model metrics. Then, ``ov.Model`` can be used as input for the ``nncf.quantize()`` method to get the quantized model (see the diagram below). +Post-training Quantization is the fastest way to optimize an arbitrary DL model and should be applied first, but it is limited in terms of achievable accuracy-performance trade-off. The recommended approach to obtain OpenVINO quantized model is to convert a model from original framework to ``ov.Model`` and ensure that the model works correctly in OpenVINO, for example, by calculating the model metrics. Then, ``ov.Model`` can be used as input for the ``nncf.quantize()`` method to get the quantized model or as input for the ``nncf.compress_weights()`` method to compress weights of Large Language Models (see the diagram below). In case of unsatisfactory accuracy or performance after Post-training Quantization, Training-time Optimization can be used as an option. -.. image:: ../_static/images/DEVELOPMENT_FLOW_V3_crunch.svg +.. image:: ../assets/images/DEVELOPMENT_FLOW_V3_crunch.svg Once the model is optimized using the aforementioned methods, it can be used for inference using the regular OpenVINO inference workflow. No changes to the inference code are required. -.. image:: ../_static/images/WHAT_TO_USE.svg +.. 
image:: ../assets/images/WHAT_TO_USE.svg Additional Resources #################### diff --git a/docs/articles_en/openvino-workflow/running-inference.rst b/docs/articles_en/openvino-workflow/running-inference.rst index 13e9d650914bb3..3ccd9f3ff7cc2e 100644 --- a/docs/articles_en/openvino-workflow/running-inference.rst +++ b/docs/articles_en/openvino-workflow/running-inference.rst @@ -49,6 +49,6 @@ OpenVINO Runtime uses a plugin architecture. Its plugins are software components The scheme below illustrates the typical workflow for deploying a trained deep learning model: -.. image:: ../_static/images/BASIC_FLOW_IE_C.svg +.. image:: ../assets/images/BASIC_FLOW_IE_C.svg diff --git a/docs/articles_en/openvino-workflow/running-inference/changing-input-shape.rst b/docs/articles_en/openvino-workflow/running-inference/changing-input-shape.rst index 6e019d85c35ffb..d8b76ef31545b4 100644 --- a/docs/articles_en/openvino-workflow/running-inference/changing-input-shape.rst +++ b/docs/articles_en/openvino-workflow/running-inference/changing-input-shape.rst @@ -47,7 +47,7 @@ to set a new batch size with the ``reshape`` method: The diagram below presents the results of using the method, where the size of model input is changed with an image input: -.. image:: ../../_static/images/original_vs_reshaped_model.svg +.. image:: ../../assets/images/original_vs_reshaped_model.svg When using the ``reshape`` method, you may take one of the approaches: diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection.rst index d20ec78f9407a2..d087e369ff117d 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection.rst @@ -73,7 +73,7 @@ input/output or :doc:`stateful operations <../stateful-models>` will be loaded to the CPU if it is in the candidate list. Otherwise, these models will follow the normal flow and be loaded to the device based on priority. -.. image:: ../../../_static/images/autoplugin_accelerate.svg +.. image:: ../../../assets/images/autoplugin_accelerate.svg This mechanism can be easily observed in the :ref:`Using AUTO with Benchmark app sample ` diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection/debugging-auto-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection/debugging-auto-device.rst index b0cde79f630363..46ff9165d24c11 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection/debugging-auto-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection/debugging-auto-device.rst @@ -103,27 +103,27 @@ With Intel® VTune™ Profiler installed you can configure your analysis with th 3. In the **where** pane, select **Local Host** - .. image:: ../../../../_static/images/OV_UG_supported_plugins_AUTO_debugging-img01-localhost.png + .. image:: ../../../../assets/images/OV_UG_supported_plugins_AUTO_debugging-img01-localhost.png :align: center 4. In the **what** pane, specify your target application/script on the local system. - .. image:: ../../../../_static/images/OV_UG_supported_plugins_AUTO_debugging-img02-launch.png + .. 
image:: ../../../../assets/images/OV_UG_supported_plugins_AUTO_debugging-img02-launch.png :align: center 5. In the **how** pane, choose and configure the analysis type you want to perform, for example, **Hotspots Analysis**: identify the most time-consuming functions and drill down to see time spent on each line of source code. Focus optimization efforts on hot code for the greatest performance impact. - .. image:: ../../../../_static/images/OV_UG_supported_plugins_AUTO_debugging-img03-hotspots.png + .. image:: ../../../../assets/images/OV_UG_supported_plugins_AUTO_debugging-img03-hotspots.png :align: center 6. Start the analysis by clicking the start button. When it is done, you will get a summary of the run, including top hotspots and top tasks in your application: - .. image:: ../../../../_static/images/OV_UG_supported_plugins_AUTO_debugging-img04-vtunesummary.png + .. image:: ../../../../assets/images/OV_UG_supported_plugins_AUTO_debugging-img04-vtunesummary.png :align: center 7. To analyze ITT info related to the Auto plugin, click on the **Bottom-up** tab, choose the **Task Domain/Task Type/Function/Call Stack** from the dropdown list - Auto plugin-related ITT info is under the MULTIPlugin task domain: - .. image:: ../../../../_static/images/OV_UG_supported_plugins_AUTO_debugging-img05-vtunebottomup.png + .. image:: ../../../../assets/images/OV_UG_supported_plugins_AUTO_debugging-img05-vtunebottomup.png :align: center diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst index 2d72b49fcf225a..6f817349800590 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/cpu-device.rst @@ -101,6 +101,11 @@ On platforms that natively support half-precision calculations (``bfloat16`` or of ``f32`` to achieve better performance (see the `Execution Mode Hint <#execution-mode-hint>`__). Thus, no special steps are required to run a model with ``bf16`` or ``f16`` inference precision. +.. important:: + + The ``bf16`` floating-point precision appears to have some limitations that impact the + inference accuracy in LLM models. For more details, refer to this :ref:`article `. + Using the half-precision provides the following performance benefits: - ``bfloat16`` and ``float16`` data types enable Intel® Advanced Matrix Extension (AMX) on 4+ generation Intel® Xeon® Scalable Processors, resulting in significantly faster computations on the corresponding hardware compared to AVX512 or AVX2 instructions in many deep learning operation implementations. diff --git a/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application.rst b/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application.rst index 3d62354ff51586..ce5e6fd20722a1 100644 --- a/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application.rst +++ b/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application.rst @@ -26,7 +26,7 @@ for Windows PowerShell, or ``setupvars.bat`` for Windows CMD). Otherwise, the `` variable won't be configured properly to pass ``find_package`` calls. -.. image:: ../../_static/images/IMPLEMENT_PIPELINE_with_API_C.svg +.. image:: ../../assets/images/IMPLEMENT_PIPELINE_with_API_C.svg Step 1. 
Create OpenVINO Runtime Core diff --git a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/general-optimizations.rst b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/general-optimizations.rst index 080c297ed2565a..d7520f57315ab0 100644 --- a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/general-optimizations.rst +++ b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/general-optimizations.rst @@ -43,7 +43,7 @@ The key advantage of the Async approach is that when a device is busy with the i In the example below, inference is applied to the results of the video decoding. It is possible to keep two parallel infer requests, and while the current one is processed, the input frame for the next one is being captured. This essentially hides the latency of capturing, so that the overall frame rate is rather determined only by the slowest part of the pipeline (decoding vs inference) and not by the sum of the stages. -.. image:: ../../../_static/images/synch-vs-asynch.svg +.. image:: ../../../assets/images/synch-vs-asynch.svg :alt: Intel® VTune™ screenshot Below are example-codes for the regular and async-based approaches to compare: diff --git a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimize-preprocessing.rst b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimize-preprocessing.rst index f3431bfd10b135..7d19e17a70f2c6 100644 --- a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimize-preprocessing.rst +++ b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimize-preprocessing.rst @@ -35,7 +35,7 @@ Consider the following standard example: deep learning model expects input with * For each pixel, subtract mean values and divide by scale factor. -.. image:: ../../../_static/images/preprocess_not_fit.png +.. image:: ../../../assets/images/preprocess_not_fit.png Even though it is relatively easy to implement all these steps in the application code manually, before actual inference, it is also possible with the use of Preprocessing API. Advantages of using the API are: diff --git a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-latency/model-caching-overview.rst b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-latency/model-caching-overview.rst index d24d817e760f46..38af00d3796d5d 100644 --- a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-latency/model-caching-overview.rst +++ b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-latency/model-caching-overview.rst @@ -67,7 +67,7 @@ If the device does not support the import/export capability, cache is not create Note that the first ``compile_model`` operation takes slightly longer, as the cache needs to be created - the compiled blob is saved into a cache file: -.. image:: ../../../../_static/images/caching_enabled.svg +.. image:: ../../../../assets/images/caching_enabled.svg Make it even faster: use compile_model(modelPath) @@ -113,7 +113,7 @@ With model caching enabled, the total load time is even shorter, if ``read_model :fragment: [ov:caching:part2] -.. image:: ../../../../_static/images/caching_times.svg +.. 
image:: ../../../../assets/images/caching_times.svg Advanced Examples ++++++++++++++++++++ diff --git a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-low-level-implementation.rst b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-low-level-implementation.rst index f948ca0c590d4b..1259b65fe04c49 100644 --- a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-low-level-implementation.rst +++ b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-low-level-implementation.rst @@ -5,8 +5,8 @@ Further Low-Level Implementation Details .. meta:: - :description: Automatic Batching moves asynchronicity from individual - requests to groups of requests, and the CPU streams are + :description: Automatic Batching moves asynchronicity from individual + requests to groups of requests, and the CPU streams are inference threads grouped by CPU cores. @@ -32,9 +32,9 @@ This provides much better performance for the networks than batching, especially | Requests are executed in parallel with a small number of threads. | Layer-wise, the streams imply much less synchronization. -.. |conventional-approach| image:: ../../../_static/images/cpu_execution_conventional_approach.svg +.. |conventional-approach| image:: ../../../assets/images/cpu_execution_conventional_approach.svg -.. |execution-streams| image:: ../../../_static/images/cpu_execution_streams.svg +.. |execution-streams| image:: ../../../assets/images/cpu_execution_streams.svg Compared to the batching, the parallelism is somewhat transposed (performed over inputs with much less synchronization within CNN ops): @@ -53,9 +53,9 @@ Compared to the batching, the parallelism is somewhat transposed (performed over - | |execution-streams-2| | Inputs-wise the streams are the “transposed” batch. -.. |large-batch-approach| image:: ../../../_static/images/large_batch_approach.svg +.. |large-batch-approach| image:: ../../../assets/images/large_batch_approach.svg -.. |execution-streams-2| image:: ../../../_static/images/cpu_execution_streams_2.svg +.. |execution-streams-2| image:: ../../../assets/images/cpu_execution_streams_2.svg Keep in mind that :doc:`high-level performance hints ` allow the implementation to select the optimal number of streams depending on model's compute demands and CPU capabilities, including :doc:`int8 inference <../../model-optimization>` hardware acceleration, number of cores, etc. @@ -63,15 +63,15 @@ Keep in mind that :doc:`high-level performance hints ` performs on-the-fly grouping of inference requests to improve device utilization. +:doc:`Automatic batching <../inference-devices-and-modes/automatic-batching>` performs on-the-fly grouping of inference requests to improve device utilization. It relaxes the requirement for an application to saturate devices such as GPU by using a large batch "explicitly". It performs transparent input gathering from individual inference requests followed by the actual batched execution, with no programming effort from the user: -.. image:: ../../../_static/images/batch_device.svg +.. image:: ../../../assets/images/batch_device.svg -Essentially, Automatic Batching shifts asynchronicity from individual requests to groups of requests that constitute the batches. Furthermore, for the execution to be efficient, it is very important that the requests arrive timely, without causing a batching timeout. 
+Essentially, Automatic Batching shifts asynchronicity from individual requests to groups of requests that constitute the batches. Furthermore, for the execution to be efficient, it is very important that the requests arrive timely, without causing a batching timeout. Normally, the timeout should never be hit. It is rather a graceful way to handle the application exit (when the inputs are not arriving anymore, so the full batch is not possible to collect). If a workload experiences timeouts, which lead to a drop in performance due to increased latency of every request, consider balancing its value against the batch size. For example, a smaller batch size and timeout value may yield better results than a large batch size coupled with a timeout value that cannot guarantee accommodating all the required requests. -Finally, following the ``get_tensor`` idiom section from the :doc:`general optimizations ` helps Automatic Batching to save on inputs/outputs copies. According to that, you should always prefer the "get" versions of the tensors' data access APIs in your applications. +Finally, following the ``get_tensor`` idiom section from the :doc:`general optimizations ` helps Automatic Batching to save on inputs/outputs copies. According to that, you should always prefer the "get" versions of the tensors' data access APIs in your applications. diff --git a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/precision-control.rst b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/precision-control.rst index afc333c89504ba..944b6de3032d60 100644 --- a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/precision-control.rst +++ b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/precision-control.rst @@ -18,18 +18,40 @@ of the weights, and it does not affect how the devices execute the model. This c a lot of confusion where, for example, you couldn't execute a high-performance model on the GPU by default, and the behavior between devices was different. -This guide will focus on how to control inference precision. And using lower precision is important for performance because compute bandwidth tends to be higher for smaller data types, and hardware often has special blocks for efficient multiply-accumulate operations with smaller data types only (e.g. Intel Xᵉ Matrix Extensions (XMX) on GPU and Intel Advanced Matrix Extensions (AMX) on CPU do not support ``f32``). Also, I/O operations requires less memory due to the smaller tensor byte size. This guide will focus on how to control inference precision. +This guide will focus on how to control inference precision. Using lower precision is +important for performance because compute bandwidth tends to be higher for smaller data +types, and hardware often has special blocks for efficient multiply-accumulate operations +with smaller data types only (e.g. Intel Xᵉ Matrix Extensions (XMX) on GPU and Intel +Advanced Matrix Extensions (AMX) on CPU do not support ``f32``). Also, I/O operations +require less memory due to the smaller tensor byte size.
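For illustration, here is a minimal, hedged sketch of how an application could check which floating-point precision a device would pick by default. It assumes the current ``openvino`` Python package API and that the target plugin (``CPU`` in this sketch) exposes the ``inference_precision`` property for reading:

.. code-block:: python

   import openvino as ov
   import openvino.properties.hint as hints

   core = ov.Core()
   # Ask the plugin which floating-point precision it would use by default.
   # On CPUs with native bf16 support this is typically bf16, otherwise f32.
   default_precision = core.get_property("CPU", hints.inference_precision)
   print(f"Default CPU inference precision: {default_precision}")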
Execution Mode ############## -``ov::hint::execution_mode`` is a high-level hint to control whether the user wants to keep the best accuracy (**ACCURACY mode**) or if the device can do some optimizations that may lower the accuracy for performance reasons (**PERFORMANCE mode**) - -* In **ACCURACY mode**, the device cannot convert floating point tensors to a smaller floating point type, so devices try to keep the accuracy metrics as close as possible to the original values obtained after training relative to the device's real capabilities. This means that most devices will infer with ``f32`` precision if your device supports it. -* In **PERFORMANCE mode**, the device can convert to smaller data types and apply other optimizations that may have some impact on accuracy rates, although we still try to minimize accuracy loss and may use mixed precision execution in some cases. - -If the model has been quantized using :doc:`OpenVINO optimization tools <../../model-optimization-guide/quantizing-models-post-training>` or any other method, the quantized operators will be executed with the target integer precision if the device has hardware acceleration for that type. For example, quantized ``int8`` primitives are executed with ``int8`` precision for both **ACCURACY** and **PERFORMANCE modes** if the device provides higher compute bandwidth for 8-bit data types compared to any available floating-point type. On the other hand, devices without hardware acceleration for the ``int8`` data type can keep such operators in floating point precision, and the exact floating point type will be affected by ``execution_mode`` and ``inference_precision`` properties. +``ov::hint::execution_mode`` is a high-level hint to control whether the user wants to keep +the best accuracy (**ACCURACY mode**) or if the device can do some optimizations that +may lower the accuracy for performance reasons (**PERFORMANCE mode**). + +* In **ACCURACY mode**, the device cannot convert floating point tensors to a smaller + floating point type, so devices try to keep the accuracy metrics as close as possible to + the original values obtained after training, relative to the device's real capabilities. + This means that most devices will infer with ``f32`` precision if your device supports it. +* In **PERFORMANCE mode**, the device can convert to smaller data types and apply other + optimizations that may have some impact on accuracy rates, although we still try to + minimize accuracy loss and may use mixed precision execution in some cases. + +If the model has been quantized using +:doc:`OpenVINO optimization tools <../../model-optimization-guide/quantizing-models-post-training>` +or any other method, the quantized operators will be executed with the target integer +precision if the device has hardware acceleration for that type. For example, quantized +``int8`` primitives are executed with ``int8`` precision for both **ACCURACY** and +**PERFORMANCE modes** if the device provides higher compute bandwidth for 8-bit data types +compared to any available floating-point type. On the other hand, devices without hardware +acceleration for the ``int8`` data type can keep such operators in floating point precision, +and the exact floating point type will be affected by the ``execution_mode`` and +``inference_precision`` properties.
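As a minimal sketch of how the hint can be passed when compiling a model, assuming the current ``openvino`` Python package API (``model.xml`` below is only a placeholder path); the snippets under "Code examples" below show the maintained versions:

.. code-block:: python

   import openvino as ov
   import openvino.properties.hint as hints

   core = ov.Core()
   model = core.read_model("model.xml")  # placeholder path to an IR model

   # Keep floating-point execution as close to the original model as possible.
   compiled_accuracy = core.compile_model(
       model, "CPU", {hints.execution_mode: hints.ExecutionMode.ACCURACY}
   )

   # Let the device trade some accuracy for speed (the default behavior).
   compiled_performance = core.compile_model(
       model, "CPU", {hints.execution_mode: hints.ExecutionMode.PERFORMANCE}
   )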
Code examples: @@ -53,11 +75,43 @@ Inference Precision ################### -``ov::hint::inference_precision`` precision is a lower-level property that allows you to specify the exact precision the user wants, but is less portable. For example, CPU supports ``f32`` inference precision and ``bf16`` on some platforms, GPU supports ``f32`` and ``f16``, so if a user wants to an application that uses multiple devices, they have to handle all these combinations manually or let OV do it automatically by using higher level ``execution_mode`` property. Another thing is that ``inference_precision`` is also a hint, so the value provided is not guaranteed to be used by Runtime (mainly in cases where the current device does not have the required hardware capabilities). +``ov::hint::inference_precision`` is a lower-level property that allows you +to specify the exact precision the user wants, but it is less portable. For example, the CPU +supports ``f32`` inference precision and ``bf16`` on some platforms, while the GPU supports ``f32`` +and ``f16``, so if a user wants to build an application that uses multiple devices, they have +to handle all these combinations manually or let OpenVINO do it automatically by using the +higher-level ``execution_mode`` property. Note that ``inference_precision`` is also +a hint, so the value provided is not guaranteed to be used by the Runtime (mainly in cases +where the current device does not have the required hardware capabilities). .. note:: - All devices only support floating-point data types (``f32``, ``f16``, ``bf16``) as a value for ``inference_precision`` attribute, because quantization cannot be done in Runtime. + All devices only support floating-point data types (``f32``, ``f16``, ``bf16``) as a value + for the ``inference_precision`` attribute, because quantization cannot be done in Runtime. + + +.. _limited_inference_precision: + +Limitation of the ``bf16`` inference precision +++++++++++++++++++++++++++++++++++++++++++++++ + +It is important to mention that inferring FP16 and FP32 LLM models with the ``bf16`` runtime +precision may result in an accuracy loss higher than the pre-determined threshold of 0.5%. +A higher accuracy drop may occur when inferring the **dolly-v2-12b**, **dolly-v2-3b**, and +**gpt-neox-20b** original PyTorch models with ``bf16``, and is caused by the limited +precision representation. + +To solve the issue, you might use an INT8 model and force the FP32 inference precision. +The accuracy of an INT8 model with FP32 is nearly the same as that of an FP16 model with ``f32``. +Additionally, selective FP32 execution of ops on the CPU plugin, together with the NNCF ``bf16`` +calibration, could potentially mitigate the accuracy loss. + +However, the solutions mentioned above would, unfortunately, also result in a significant +performance drop during large-batch inference on machines with Intel AMX-BF16 SPR. +In such cases, the fused multiply-add operation (FMA) is used instead of AMX. Also, +in a compute-bound case, such as LLM batch inference/serving, these workarounds +would drastically reduce the throughput by more than 60%. + Additional Resources diff --git a/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst b/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst index eec850fa1cd18b..49c70cb964cb87 100644 --- a/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst +++ b/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst @@ -16,7 +16,7 @@ output.
In contrast, for a "stateless" model to pass data between runs, all prod returned as output and needs to be handled by the application itself for reuse at the next execution. -.. image:: ../../_static/images/stateful_model_example.svg +.. image:: ../../assets/images/stateful_model_example.svg :alt: example comparison between stateless and stateful model implementations :align: center :scale: 90 % @@ -113,7 +113,7 @@ states. each run performed in a different infer request than the previous one would require the state to be set "manually", using the ``ov::VariableState::set_state`` method. -.. image:: ../../_static/images/stateful_model_init_subgraph.svg +.. image:: ../../assets/images/stateful_model_init_subgraph.svg :alt: diagram of how initial state value is set or reset :align: center :scale: 100 % diff --git a/docs/articles_en/openvino-workflow/running-inference/stateful-models/obtaining-stateful-openvino-model.rst b/docs/articles_en/openvino-workflow/running-inference/stateful-models/obtaining-stateful-openvino-model.rst index 2c005a9bd8a3f3..67e70c9b999f0c 100644 --- a/docs/articles_en/openvino-workflow/running-inference/stateful-models/obtaining-stateful-openvino-model.rst +++ b/docs/articles_en/openvino-workflow/running-inference/stateful-models/obtaining-stateful-openvino-model.rst @@ -28,7 +28,7 @@ MakeStateful Transformation The MakeStateful transformation changes the structure of the model by replacing the user-defined pairs of Parameter and Results with the Assign and ReadValue operations: -.. image:: ../../../_static/images/make_stateful_simple.svg +.. image:: ../../../assets/images/make_stateful_simple.svg :alt: diagram of MakeStateful Transformation :scale: 90 % :align: center @@ -44,7 +44,7 @@ Parameter/Result tensor names. If there are no tensor names, **Examples:** -.. image:: ../../../_static/images/make_stateful_detailed.png +.. image:: ../../../assets/images/make_stateful_detailed.png :alt: detailed diagram of MakeStateful Transformation :align: center @@ -91,7 +91,7 @@ and :doc:`Loop <../../../documentation/openvino-ir-format/operation-sets/operati and replacing pairs of Parameter and Results with the Assign and ReadValue operations, as illustrated by the following example: -.. image:: ../../../_static/images/applying_low_latency_2.svg +.. image:: ../../../assets/images/applying_low_latency_2.svg :alt: diagram of LowLatency Transformation :align: center @@ -162,7 +162,7 @@ Applying LowLatency2 Transformation :fragment: [ov:low_latency_2_use_parameters] - .. image:: ../../../_static/images/llt2_use_const_initializer.svg + .. image:: ../../../assets/images/llt2_use_const_initializer.svg :alt: diagram of constant subgraph initialization :align: center @@ -184,7 +184,7 @@ Applying LowLatency2 Transformation 4. Use state API. See sections :doc:`OpenVINO State API <../stateful-models>`, :ref:`Stateful Model Inference `. - .. image:: ../../../_static/images/low_latency_limitation_2.svg + .. image:: ../../../assets/images/low_latency_limitation_2.svg :alt: diagram showing low latency limitation :scale: 70 % :align: center diff --git a/docs/articles_en/openvino-workflow/torch-compile.rst b/docs/articles_en/openvino-workflow/torch-compile.rst index 02e2364c339a94..57682f2e143cd9 100644 --- a/docs/articles_en/openvino-workflow/torch-compile.rst +++ b/docs/articles_en/openvino-workflow/torch-compile.rst @@ -35,7 +35,7 @@ any additional PyTorch-based tracing/scripting. Execution diagram: -.. image:: ../_static/images/torch_compile_backend_openvino.svg +.. 
image:: ../assets/images/torch_compile_backend_openvino.svg :width: 992px :height: 720px :scale: 60% diff --git a/docs/home.rst b/docs/home.rst index 524a3ce5b48345..08b2e8d62cc340 100644 --- a/docs/home.rst +++ b/docs/home.rst @@ -57,7 +57,7 @@ Check out the `OpenVINO Cheat Sheet. a { font-size: 2rem!important; } -.bd-header .navbar-nav li a.nav-link:hover { - color: white; - text-decoration: none; -} - -.bd-links__title { +.svg-inline--fa .fa-outdent { display: none; + visibility: none; + color: white; } li.toctree-l1.has-children > a.reference.internal { @@ -196,6 +202,11 @@ nav.bd-links li>a { } } +.download-docs .sst-dropdown .sst-btn { + border-color: lightgray !important; +} + + /* Moving dropdown arrows to the left */ details.sd-dropdown .sd-summary-up, details.sd-dropdown .sd-summary-down { @@ -209,6 +220,11 @@ details.sd-dropdown:not([open]).sd-card { padding: 0px; } +/* Ttile is at the same place for both open and close states */ +.sd-card-header { + border-radius: 0px !important; + +} /* Ttile is at the same place for both open and close states */ details.sd-dropdown[open].sd-card { @@ -220,8 +236,6 @@ details.sd-dropdown .sd-summary-title { padding-left: 40px; } - - /* Second level items */ #bd-docs-nav > div > ul > li > ul { padding-left: 0.3rem; @@ -264,7 +278,6 @@ details.sd-dropdown .sd-summary-title { padding-right: 10px!important; } - /* Code reference text formatting override */ /* =================================================== */ code { @@ -283,9 +296,28 @@ code { background-color: #0054AE !important; } -.admonition { - border-radius:0px !important; +.admonition.tip, div.admonition.tip { + border-color: var(--pst-color-success) !important; + background-color: #effdf6 !important; } + +.admonition.important, div.admonition.important { + border-color: var(--pst-color-attention); + background-color: #fbf5f0 !important; +} + +.admonition.warning, div.admonition.warning { + background-color: #fbf5f0 !important; +} + +.admonition.note, div.admonition.note { + background-color: #f1fafe; +} + +details.sd-dropdown summary.sd-card-header+div.sd-summary-content { + background-color: rgb(242, 248, 251); +} + /* Table Sort Button */ /* =================================================== */ .sort-header { diff --git a/docs/sphinx_setup/_static/images/applying_low_latency.svg b/docs/sphinx_setup/_static/images/applying_low_latency.svg deleted file mode 100644 index 68ab0c24149491..00000000000000 --- a/docs/sphinx_setup/_static/images/applying_low_latency.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3cec32fe436ce551bbd91a60eac39588f5ab9b599d14c6496b89a4e8a9a37909 -size 266752 diff --git a/docs/sphinx_setup/_static/images/configuration_dialog.png b/docs/sphinx_setup/_static/images/configuration_dialog.png deleted file mode 100644 index 349fafc25e387f..00000000000000 --- a/docs/sphinx_setup/_static/images/configuration_dialog.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:55b68c91d4991dff5965d19e9b637848bbdcb49e75dbaae6af11d58fde7cf846 -size 20433 diff --git a/docs/sphinx_setup/_static/images/download_btn_github.svg b/docs/sphinx_setup/_static/images/download_btn_github.svg deleted file mode 100644 index da039dacffccb5..00000000000000 --- a/docs/sphinx_setup/_static/images/download_btn_github.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c00e5b644b75ac4fe82907567b684552f703540bdd2948413b9a24d0a6762492 -size 1350 diff --git 
a/docs/sphinx_setup/_static/images/download_btn_installer.svg b/docs/sphinx_setup/_static/images/download_btn_installer.svg deleted file mode 100644 index bb6fc4d16059ff..00000000000000 --- a/docs/sphinx_setup/_static/images/download_btn_installer.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:467a378cfe3fdfd298a195be51054f025d907696904905a899d5cfb1ba0532a2 -size 727 diff --git a/docs/sphinx_setup/_static/images/gapi_development_workflow.png b/docs/sphinx_setup/_static/images/gapi_development_workflow.png deleted file mode 100644 index 658fdafe87a60a..00000000000000 --- a/docs/sphinx_setup/_static/images/gapi_development_workflow.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a0a11bedbfe2df3352b064e80498aa39fbc3817eaf99439865a090f34501e44a -size 25936 diff --git a/docs/sphinx_setup/_static/images/gapi_face_analytics_pipeline.png b/docs/sphinx_setup/_static/images/gapi_face_analytics_pipeline.png deleted file mode 100644 index 31f045c5d77ca2..00000000000000 --- a/docs/sphinx_setup/_static/images/gapi_face_analytics_pipeline.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:662a823fbef3be0cca1755de9118e73b4137fe7ec4b7cb6a389e64b9ec5a9c13 -size 13511 diff --git a/docs/sphinx_setup/_static/images/gapi_face_beautification_algorithm.png b/docs/sphinx_setup/_static/images/gapi_face_beautification_algorithm.png deleted file mode 100644 index 7693c3b0fd825e..00000000000000 --- a/docs/sphinx_setup/_static/images/gapi_face_beautification_algorithm.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:12fe8e0b841aa6759f3b1975d3a877e65b8d72b752d11ffd212b67d11e62e048 -size 19539 diff --git a/docs/sphinx_setup/_static/images/gapi_face_beautification_example.jpg b/docs/sphinx_setup/_static/images/gapi_face_beautification_example.jpg deleted file mode 100644 index eb3df6b58785bf..00000000000000 --- a/docs/sphinx_setup/_static/images/gapi_face_beautification_example.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb32d3db8768ff157daeff999cc7f4361d2bca866ed6dc95b8f78d8cc62ae208 -size 176525 diff --git a/docs/sphinx_setup/_static/images/gapi_kernel_implementation_hierarchy.png b/docs/sphinx_setup/_static/images/gapi_kernel_implementation_hierarchy.png deleted file mode 100644 index f910caa840d191..00000000000000 --- a/docs/sphinx_setup/_static/images/gapi_kernel_implementation_hierarchy.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f291422f562825d4c5eee718b7c22e472b02a5a0a9c0be01d59b6b7cd8d756b1 -size 14603 diff --git a/docs/sphinx_setup/_static/images/gapi_programming_model.png b/docs/sphinx_setup/_static/images/gapi_programming_model.png deleted file mode 100644 index 2ac10dcc82c13f..00000000000000 --- a/docs/sphinx_setup/_static/images/gapi_programming_model.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:925f70ede92d71e16733d78e003f62cd8bfdee0790bddbf2b7ce4fc8ef3f44bf -size 171518 diff --git a/docs/sphinx_setup/_static/images/github.png b/docs/sphinx_setup/_static/images/github.png deleted file mode 100644 index 4bf56a3d3e4799..00000000000000 --- a/docs/sphinx_setup/_static/images/github.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7b2749d93dff16fc9062a0fa01fd694bf19385a0b4f3d0e409eb56f2648e3cfc -size 11929 diff --git a/docs/sphinx_setup/_static/images/head_banner.jpg 
b/docs/sphinx_setup/_static/images/head_banner.jpg deleted file mode 100644 index 45773d26369077..00000000000000 --- a/docs/sphinx_setup/_static/images/head_banner.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:07de2ee0d18f1d40afe1f2bb5420c7060eff09026a9138399d21b49b35cc0b8e -size 184552 diff --git a/docs/sphinx_setup/_static/images/img/import_pytorch.svg b/docs/sphinx_setup/_static/images/img/import_pytorch.svg deleted file mode 100644 index d1f8f5030e6566..00000000000000 --- a/docs/sphinx_setup/_static/images/img/import_pytorch.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7419b60d37a9bc058626c52fcbfec20c3a5d22c6d0875fb84ef0df7ec2a68671 -size 142191 diff --git a/docs/sphinx_setup/_static/images/img/import_tensorflow.svg b/docs/sphinx_setup/_static/images/img/import_tensorflow.svg deleted file mode 100644 index 40d0534168133a..00000000000000 --- a/docs/sphinx_setup/_static/images/img/import_tensorflow.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d5666c2ee7503bc2844a99f73c1b64afacd2c42dadef441ce115cc18b00922c7 -size 224644 diff --git a/docs/sphinx_setup/_static/images/img/openvino-logo-purple-black.png b/docs/sphinx_setup/_static/images/img/openvino-logo-purple-black.png deleted file mode 100644 index 6248a7820c50f7..00000000000000 --- a/docs/sphinx_setup/_static/images/img/openvino-logo-purple-black.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61e237b3ced7eaa0cf1f8c2688753867b172712925068a4a47e07b5c71e48bdf -size 89866 diff --git a/docs/sphinx_setup/_static/images/import_pytorch.svg b/docs/sphinx_setup/_static/images/import_pytorch.svg deleted file mode 100644 index d1f8f5030e6566..00000000000000 --- a/docs/sphinx_setup/_static/images/import_pytorch.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7419b60d37a9bc058626c52fcbfec20c3a5d22c6d0875fb84ef0df7ec2a68671 -size 142191 diff --git a/docs/sphinx_setup/_static/images/import_tensorflow.svg b/docs/sphinx_setup/_static/images/import_tensorflow.svg deleted file mode 100644 index 40d0534168133a..00000000000000 --- a/docs/sphinx_setup/_static/images/import_tensorflow.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d5666c2ee7503bc2844a99f73c1b64afacd2c42dadef441ce115cc18b00922c7 -size 224644 diff --git a/docs/sphinx_setup/_static/images/low_latency_limitation_1.svg b/docs/sphinx_setup/_static/images/low_latency_limitation_1.svg deleted file mode 100644 index 90f947b28c9754..00000000000000 --- a/docs/sphinx_setup/_static/images/low_latency_limitation_1.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dcc7af1fddf966fba75aece332e1fabb722ef780d6935ada2ddbcf3bb229223e -size 114289 diff --git a/docs/sphinx_setup/_static/images/model_conversion_diagram.svg b/docs/sphinx_setup/_static/images/model_conversion_diagram.svg deleted file mode 100644 index 8bb8d171bd1eee..00000000000000 --- a/docs/sphinx_setup/_static/images/model_conversion_diagram.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ab1d83dbd1546cb8eaada19501cf08d26e3ca1e2ce72fce63356e897fa26750e -size 253024 diff --git a/docs/sphinx_setup/_static/images/notebook_eye.png b/docs/sphinx_setup/_static/images/notebook_eye.png deleted file mode 100644 index ecc13e7bdfba89..00000000000000 --- a/docs/sphinx_setup/_static/images/notebook_eye.png +++ /dev/null @@ 
-1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1a2e58cf3e5703356b0e060ebc7cb0cbb852db9cde003d41c1d86bafc3a4ccb1 -size 68559 diff --git a/docs/sphinx_setup/_static/images/openvino-install.png b/docs/sphinx_setup/_static/images/openvino-install.png deleted file mode 100644 index adc5242bbb2fc4..00000000000000 --- a/docs/sphinx_setup/_static/images/openvino-install.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ef87640e224de61f41e76541e22a1392c84827dd0b7f70f3c616d86e75456aef -size 8508 diff --git a/docs/sphinx_setup/_static/images/openvino-uninstall-cli.png b/docs/sphinx_setup/_static/images/openvino-uninstall-cli.png deleted file mode 100644 index 654b79a5451a39..00000000000000 --- a/docs/sphinx_setup/_static/images/openvino-uninstall-cli.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cd260d96e1d8d425fba1eb2caf8b920e9c0511b421a81909babddca4ffa42dcb -size 37617 diff --git a/docs/sphinx_setup/_static/images/openvino-uninstall-dropdown-linux.png b/docs/sphinx_setup/_static/images/openvino-uninstall-dropdown-linux.png deleted file mode 100644 index 57d514d1c182b9..00000000000000 --- a/docs/sphinx_setup/_static/images/openvino-uninstall-dropdown-linux.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d0d7c0f692e14f3bb90d924d5ca25175e961963dac1d9a2dc6ca034f44d15863 -size 35667 diff --git a/docs/sphinx_setup/_static/images/openvino-uninstall-dropdown-macos.png b/docs/sphinx_setup/_static/images/openvino-uninstall-dropdown-macos.png deleted file mode 100644 index 672acf2468d7b6..00000000000000 --- a/docs/sphinx_setup/_static/images/openvino-uninstall-dropdown-macos.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a67b8d8a8aafcb14e4334df138f526ace9a243e297511a0e89b3f0fafcaf003e -size 33892 diff --git a/docs/sphinx_setup/_static/images/openvino-uninstall-dropdown-win.png b/docs/sphinx_setup/_static/images/openvino-uninstall-dropdown-win.png deleted file mode 100644 index 1d589ce2ad0ed0..00000000000000 --- a/docs/sphinx_setup/_static/images/openvino-uninstall-dropdown-win.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5847979227bd81d4f8d1d5be532acd81c056466e226204be205565d00b69fa34 -size 34976 diff --git a/docs/sphinx_setup/_static/images/openvino-uninstall-linux.png b/docs/sphinx_setup/_static/images/openvino-uninstall-linux.png deleted file mode 100644 index d22bee18602d7c..00000000000000 --- a/docs/sphinx_setup/_static/images/openvino-uninstall-linux.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5f276ad34326176aec19e93d3a277ede95096530e675991e71865b6edb6a5469 -size 42777 diff --git a/docs/sphinx_setup/_static/images/openvino-uninstall-macos.png b/docs/sphinx_setup/_static/images/openvino-uninstall-macos.png deleted file mode 100644 index ebfbe68495fb3f..00000000000000 --- a/docs/sphinx_setup/_static/images/openvino-uninstall-macos.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:398636f71383bb2feff5492fcff3dcd7c7b30b155b7a7c219755d8bc40ef788c -size 27305 diff --git a/docs/sphinx_setup/_static/images/openvino-uninstall-win.png b/docs/sphinx_setup/_static/images/openvino-uninstall-win.png deleted file mode 100644 index 96206bf1bdfec8..00000000000000 --- a/docs/sphinx_setup/_static/images/openvino-uninstall-win.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 
-oid sha256:5994af2e6d7c5403151e1dd3ed5741809590787b4490518b040bb30fe30d4cf3 -size 46941 diff --git a/docs/sphinx_setup/_static/images/range_supervision/img_combined_2.png b/docs/sphinx_setup/_static/images/range_supervision/img_combined_2.png deleted file mode 100644 index 039e9a324d1d7b..00000000000000 --- a/docs/sphinx_setup/_static/images/range_supervision/img_combined_2.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:271ba164a9726a5cf8d577f02db258c76df94e9ff79c3bebf95371ebdaa7d82d -size 1719169 diff --git a/docs/sphinx_setup/_static/images/range_supervision/scheme3.svg b/docs/sphinx_setup/_static/images/range_supervision/scheme3.svg deleted file mode 100644 index 4504c1d3b94758..00000000000000 --- a/docs/sphinx_setup/_static/images/range_supervision/scheme3.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2545abc4e5d26f6eb52c832cafa9ac1319958dfd7d550922e13cfcab44f1379c -size 68280 diff --git a/docs/sphinx_setup/_static/images/selection_dialog.png b/docs/sphinx_setup/_static/images/selection_dialog.png deleted file mode 100644 index 82ae960c8ad518..00000000000000 --- a/docs/sphinx_setup/_static/images/selection_dialog.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:194b9b8026456b9bb7d05834ffebc44192e028c0338369f0c77afc4772192a01 -size 18851 diff --git a/docs/sphinx_setup/_static/images/state_network_example.svg b/docs/sphinx_setup/_static/images/state_network_example.svg deleted file mode 100644 index 56d695015077bd..00000000000000 --- a/docs/sphinx_setup/_static/images/state_network_example.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8665a06ce99edcb4ccdade52b7fc5e2ae2a4810c5099cc35ffbb23d48fae56b5 -size 16970 diff --git a/docs/sphinx_setup/_static/images/supported_devices.png b/docs/sphinx_setup/_static/images/supported_devices.png deleted file mode 100644 index ff117bd8d61f34..00000000000000 --- a/docs/sphinx_setup/_static/images/supported_devices.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:566aab6ef86a50dad4fba5483a9b0abffc85778dccee7a0c7e98d4b09447f9b1 -size 130586 diff --git a/docs/sphinx_setup/_static/images/torch_compile_backend_openvino_ts.svg b/docs/sphinx_setup/_static/images/torch_compile_backend_openvino_ts.svg deleted file mode 100644 index 4be98857e767f5..00000000000000 --- a/docs/sphinx_setup/_static/images/torch_compile_backend_openvino_ts.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0e26fe889ada0e02a3bbc03e451a7e1d4b06037723349971efff1d721b5e13f6 -size 117253 diff --git a/docs/sphinx_setup/_static/js/modern.js b/docs/sphinx_setup/_static/js/modern.js new file mode 100644 index 00000000000000..dae212a07215db --- /dev/null +++ b/docs/sphinx_setup/_static/js/modern.js @@ -0,0 +1,12 @@ +$(document).ready(function() { + const elems = $( 'details.sd-dropdown' ); + for(let i = 0; i < elems.length; i++){ + elems[i].style.cssText = 'box-shadow: none !important; border: 1px !important;' + } + + + const admonitions = $( '.admonition' ); + for(let i = 0; i < admonitions.length; i++){ + admonitions[i].style.cssText = 'box-shadow: none !important; border-radius:0px !important; ' + } +}) \ No newline at end of file diff --git a/docs/sphinx_setup/conf.py b/docs/sphinx_setup/conf.py index c85a612be760b9..669e15760ae27b 100644 --- a/docs/sphinx_setup/conf.py +++ b/docs/sphinx_setup/conf.py @@ -199,6 +199,7 @@ 
'js/papaparse.min.js', 'js/viewer.min.js', 'js/custom.js', + 'js/modern.js', ] # monkeypatch sphinx api doc to prevent showing inheritance from object and enum.Enum diff --git a/docs/sphinx_setup/index.rst b/docs/sphinx_setup/index.rst index 3b4c1d48347602..fcccf196e94fdf 100644 --- a/docs/sphinx_setup/index.rst +++ b/docs/sphinx_setup/index.rst @@ -57,7 +57,7 @@ Check out the `OpenVINO Cheat Sheet. object->get_element_type(); - *tensor_type = (ov_element_type_e)type; + *tensor_type = find_ov_element_type_e(type); } CATCH_OV_EXCEPTIONS diff --git a/src/bindings/c/src/ov_tensor.cpp b/src/bindings/c/src/ov_tensor.cpp index 3ad7d408add000..952f9cb394ba92 100644 --- a/src/bindings/c/src/ov_tensor.cpp +++ b/src/bindings/c/src/ov_tensor.cpp @@ -19,16 +19,20 @@ const std::map element_type_map = { {ov_element_type_e::I32, ov::element::i32}, {ov_element_type_e::I64, ov::element::i64}, {ov_element_type_e::U1, ov::element::u1}, + {ov_element_type_e::U2, ov::element::u2}, + {ov_element_type_e::U3, ov::element::u3}, {ov_element_type_e::U4, ov::element::u4}, + {ov_element_type_e::U6, ov::element::u6}, {ov_element_type_e::U8, ov::element::u8}, {ov_element_type_e::U16, ov::element::u16}, {ov_element_type_e::U32, ov::element::u32}, {ov_element_type_e::U64, ov::element::u64}, {ov_element_type_e::NF4, ov::element::nf4}, {ov_element_type_e::F8E4M3, ov::element::f8e4m3}, - {ov_element_type_e::F8E5M3, ov::element::f8e5m2}}; + {ov_element_type_e::F8E5M3, ov::element::f8e5m2}, + {ov_element_type_e::STRING, ov::element::string}}; -inline ov_element_type_e find_ov_element_type_e(ov::element::Type type) { +ov_element_type_e find_ov_element_type_e(ov::element::Type type) { for (auto iter = element_type_map.begin(); iter != element_type_map.end(); iter++) { if (iter->second == type) { return iter->first; diff --git a/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py b/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py index 56f5cd79a61480..596c4415868910 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py @@ -56,6 +56,15 @@ def __init__(self, pt_module, fx_gm=None, nodes=None, mark_node_callback=None, i uargs = self.unpack_containers(self._nodes[i].args) self._outputs = [(arg[0], self._nodes.index(arg[1])) for arg in uargs if arg[1] is not None] + for idx, shape in enumerate(found_shapes): + if shape is not None: + new_shape=[] + for dim in range(0, len(shape)): + if (type(shape[dim]).__name__ == "SymInt"): + new_shape.append(-1) + else: + new_shape.append(shape[dim]) + found_shapes[idx] = torch.Size(new_shape) if not input_shapes or len(input_shapes) == 0: self.input_shapes = found_shapes diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py index fb7438aa78295e..4947589a77fd22 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py @@ -54,7 +54,11 @@ def openvino(subgraph, example_inputs, options=None): if (_get_aot_autograd(options)): global openvino_options openvino_options = options - return aot_autograd(fw_compiler=fx_openvino, bw_compiler=fx_openvino)(subgraph, example_inputs) + decompositions = _get_decompositions(options) + get_inf_decomposition_list() + decompositions = decompositions + get_aot_decomposition_list() + return aot_autograd(fw_compiler=fx_openvino, + 
bw_compiler=fx_openvino, + decompositions=get_decompositions(decompositions))(subgraph, example_inputs) return fx_openvino(subgraph, example_inputs, options) def fx_openvino(subgraph, example_inputs, options=None): @@ -82,15 +86,17 @@ def _call(*args): if inputs_reversed: example_inputs.reverse() - from torch._subclasses.fake_tensor import FakeTensorMode - decompositions = _get_decompositions(options) + get_inf_decomposition_list() if (_get_aot_autograd(options)): - decompositions = decompositions + get_aot_decomposition_list() - with FakeTensorMode(allow_non_fake_inputs=True): - model = make_fx(subgraph, decomposition_table=get_decompositions(decompositions))(*example_inputs) + model = subgraph + else: + from torch._subclasses.fake_tensor import FakeTensorMode + decompositions = _get_decompositions(options) + get_inf_decomposition_list() + with FakeTensorMode(allow_non_fake_inputs=True): + model = make_fx(subgraph, decomposition_table=get_decompositions(decompositions))(*example_inputs) + + with torch.no_grad(): + model.eval() - with torch.no_grad(): - model.eval() partitioner = Partitioner(options) compiled_model = partitioner.make_partitions(model, options) diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/compile.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/compile.py index 91192e4110d2bb..fa446893a05d07 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/compile.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/compile.py @@ -94,10 +94,14 @@ def openvino_compile(gm: GraphModule, *args, model_hash_str: str = None, options input_shapes = [] input_types = [] for idx, input_data in enumerate(args): - input_types.append(input_data.type()) - input_shapes.append(input_data.size()) + if isinstance(input_data, int): + input_types.append(torch.int64) + input_shapes.append(torch.Size([1])) + else: + input_types.append(input_data.type()) + input_shapes.append(input_data.size()) - decoder = TorchFXPythonDecoder(gm, input_shapes=input_shapes, input_types=input_types) + decoder = TorchFXPythonDecoder(gm) im = fe.load(decoder) @@ -118,8 +122,13 @@ def openvino_compile(gm: GraphModule, *args, model_hash_str: str = None, options } for idx, input_data in enumerate(args): - om.inputs[idx].get_node().set_element_type(dtype_mapping[input_data.dtype]) - om.inputs[idx].get_node().set_partial_shape(PartialShape(list(input_data.shape))) + if isinstance(input_data, int): + om.inputs[idx].get_node().set_element_type(dtype_mapping[torch.int64]) + om.inputs[idx].get_node().set_partial_shape(PartialShape(list(torch.Size([1])))) + else: + om.inputs[idx].get_node().set_element_type(dtype_mapping[input_data.dtype]) + om.inputs[idx].get_node().set_partial_shape(PartialShape(list(decoder.input_shapes[idx]))) + om.validate_nodes_and_infer_types() config = _get_config(options) @@ -129,4 +138,4 @@ def openvino_compile(gm: GraphModule, *args, model_hash_str: str = None, options config["CACHE_DIR"] = cache_root compiled = core.compile_model(om, device, config) - return compiled \ No newline at end of file + return compiled diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/execute.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/execute.py index 8d04efaa71ab8a..4f41f7b5a6a9de 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/execute.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/execute.py @@ -21,7 +21,7 @@ from 
openvino.frontend.pytorch.torchdynamo.partition import Partitioner from openvino.frontend.pytorch.torchdynamo.compile import openvino_compile from openvino.runtime import Core, Type, PartialShape -from openvino.frontend.pytorch.torchdynamo.backend_utils import _get_cache_dir, _get_device +from openvino.frontend.pytorch.torchdynamo.backend_utils import _get_cache_dir, _get_device, _get_aot_autograd from typing import Callable, Optional, Any @@ -40,6 +40,7 @@ ) compiled_cache = {} +req_cache = {} max_openvino_partitions = 0 partitioned_modules = {} @@ -91,14 +92,19 @@ def openvino_execute(gm: GraphModule, *args, executor_parameters=None, partition if use_cache and (partition_id in compiled_cache): compiled = compiled_cache[partition_id] + req = req_cache[partition_id] else: compiled = openvino_compile(gm, *args, model_hash_str=model_hash_str, options=options) compiled_cache[partition_id] = compiled + req = compiled.create_infer_request() + req_cache[partition_id] = req flat_args, _ = tree_flatten(args) - ov_inputs = [a.detach().cpu().numpy() for a in flat_args] + ov_inputs = [] + for arg in flat_args: + ov_inputs.append((arg if isinstance(arg, int) else arg.detach().cpu().numpy())) - res = compiled(ov_inputs) + res = req.infer(ov_inputs, share_inputs=True, share_outputs=True) results1 = [torch.from_numpy(res[out]) for out in compiled.outputs] if len(results1) == 1: @@ -123,7 +129,7 @@ def __call__(self, *args): try: result = openvino_execute(self.gm, *args, executor_parameters=self.executor_parameters, partition_id=self.partition_id, options=self.options) except Exception: - logger.warning("OpenVINO execution failed. Falling back to native PyTorch execution.") + logger.debug("OpenVINO execution failed. Falling back to native PyTorch execution.") self.perm_fallback = True return self.gm(*args) @@ -162,11 +168,12 @@ def openvino_execute_partitioned(gm: GraphModule, *args, executor_parameters=Non model_hash_str = executor_parameters.get("model_hash_str", None) signature = str(id(gm)) - for idx, input_data in enumerate(args): - if isinstance(input_data, torch.Tensor): - signature = signature + "_" + str(idx) + ":" + str(input_data.type())[6:] + ":" + str(input_data.size())[11:-1].replace(" ", "") - else: - signature = signature + "_" + str(idx) + ":" + type(input_data).__name__ + ":val(" + str(input_data) + ")" + if (not _get_aot_autograd(options)): + for idx, input_data in enumerate(args): + if isinstance(input_data, torch.Tensor): + signature = signature + "_" + str(idx) + ":" + str(input_data.type())[6:] + ":" + str(input_data.size())[11:-1].replace(" ", "") + else: + signature = signature + "_" + str(idx) + ":" + type(input_data).__name__ + ":val(" + str(input_data) + ")" if signature not in partitioned_modules: partitioned_modules[signature] = partition_graph(gm, use_python_fusion_cache=use_python_fusion_cache, diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/op_support.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/op_support.py index 8f2ba4906b46bc..c2d08bd14638df 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/op_support.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/op_support.py @@ -28,7 +28,12 @@ class OperatorSupport(OperatorSupport): def __init__(self, options): support_dict = { + "_operator.add": None, + "_operator.floordiv": None, "_operator.getitem": None, + "_operator.mul": None, + "_operator.sub": None, + "torch.ops.aten.sym_size.int": None, 
"torch.ops.aten._adaptive_avg_pool1d.default": None, "torch.ops.aten._adaptive_avg_pool2d.default": None, "torch.ops.aten._adaptive_avg_pool3d.default": None, diff --git a/src/bindings/python/src/pyopenvino/core/common.cpp b/src/bindings/python/src/pyopenvino/core/common.cpp index 5d8a4a73d6b312..9f57b794e2bff6 100644 --- a/src/bindings/python/src/pyopenvino/core/common.cpp +++ b/src/bindings/python/src/pyopenvino/core/common.cpp @@ -37,35 +37,37 @@ py::dtype get_dtype(const ov::element::Type& ov_type) { return ov_type_to_dtype().at(ov_type); } -const std::map& dtype_num_to_ov_type() { - static const std::map dtype_to_ov_type_mapping = { - {23, ov::element::f16}, // float16 - {11, ov::element::f32}, // float32 - {12, ov::element::f64}, // float64 - {1, ov::element::i8}, // int8 - {3, ov::element::i16}, // int16 -#ifdef _WIN32 - {7, ov::element::i32}, // int32 - {9, ov::element::i64}, // int64 -#else - {5, ov::element::i32}, // int32 - {7, ov::element::i64}, // int64 -#endif - {2, ov::element::u8}, // uint8 - {4, ov::element::u16}, // uint16 -#ifdef _WIN32 - {8, ov::element::u32}, // uint32 - {10, ov::element::u64}, // uint64 -#else - {6, ov::element::u32}, // uint32 - {8, ov::element::u64}, // uint64 -#endif - {0, ov::element::boolean}, // bool - {18, ov::element::string}, // bytes_ - {19, ov::element::string}, // str_ - {18, ov::element::string}, // bytes - {19, ov::element::string}, // str +std::map init_num_to_ov_type() { + static const std::map str_to_type_mapping = { + {"float16", ov::element::f16}, + {"float32", ov::element::f32}, + {"float64", ov::element::f64}, + {"int8", ov::element::i8}, + {"int16", ov::element::i16}, + {"int32", ov::element::i32}, + {"int64", ov::element::i64}, + {"uint8", ov::element::u8}, + {"uint16", ov::element::u16}, + {"uint32", ov::element::u32}, + {"uint64", ov::element::u64}, + {"bool", ov::element::boolean}, + {"bytes_", ov::element::string}, + {"str_", ov::element::string}, + {"bytes", ov::element::string}, + {"str", ov::element::string}, }; + + std::map int_to_type_mapping; + + for (const auto& e : str_to_type_mapping) { + int_to_type_mapping[py::dtype(e.first).num()] = e.second; + } + + return int_to_type_mapping; +} + +const std::map& dtype_num_to_ov_type() { + static const std::map dtype_to_ov_type_mapping = init_num_to_ov_type(); return dtype_to_ov_type_mapping; } diff --git a/src/bindings/python/src/pyopenvino/core/common.hpp b/src/bindings/python/src/pyopenvino/core/common.hpp index 374a31e7ece5f7..2311855a8ad34e 100644 --- a/src/bindings/python/src/pyopenvino/core/common.hpp +++ b/src/bindings/python/src/pyopenvino/core/common.hpp @@ -47,6 +47,8 @@ const std::map& ov_type_to_dtype(); py::dtype get_dtype(const ov::element::Type& ov_type); +std::map init_num_to_ov_type(); + const std::map& dtype_num_to_ov_type(); ov::element::Type get_ov_type(const py::array& array); diff --git a/src/common/transformations/tests/common_optimizations/rms_norm_decomposition_test.cpp b/src/common/transformations/tests/common_optimizations/rms_norm_decomposition_test.cpp index 2094bbf085bc26..579f5f56114dcf 100644 --- a/src/common/transformations/tests/common_optimizations/rms_norm_decomposition_test.cpp +++ b/src/common/transformations/tests/common_optimizations/rms_norm_decomposition_test.cpp @@ -50,6 +50,9 @@ TEST_F(TransformationTestsF, RMSNormFusionTest1) { model_ref = std::make_shared(ov::NodeVector{rms}, ov::ParameterVector{input}); } + comparator.enable(FunctionsComparator::CmpValues::ACCURACY); + 
comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); } TEST_F(TransformationTestsF, RMSNormFusionTest2) { diff --git a/src/core/include/openvino/op/roi_align_rotated.hpp b/src/core/include/openvino/op/roi_align_rotated.hpp index 500988e8745971..5e7bf95fa68bcc 100644 --- a/src/core/include/openvino/op/roi_align_rotated.hpp +++ b/src/core/include/openvino/op/roi_align_rotated.hpp @@ -8,13 +8,13 @@ namespace ov { namespace op { -namespace v14 { +namespace v15 { /// \brief ROIAlignRotated operation. /// /// \ingroup ov_ops_cpp_api class OPENVINO_API ROIAlignRotated : public util::ROIAlignBase { public: - OPENVINO_OP("ROIAlignRotated", "opset14", util::ROIAlignBase); + OPENVINO_OP("ROIAlignRotated", "opset15", util::ROIAlignBase); ROIAlignRotated() = default; /// \brief Constructs a ROIAlignRotated operation. @@ -57,6 +57,6 @@ class OPENVINO_API ROIAlignRotated : public util::ROIAlignBase { private: bool m_clockwise_mode; }; -} // namespace v14 +} // namespace v15 } // namespace op } // namespace ov diff --git a/src/core/include/openvino/opsets/opset14_tbl.hpp b/src/core/include/openvino/opsets/opset14_tbl.hpp index f96544d65edf81..1e1c520e475852 100644 --- a/src/core/include/openvino/opsets/opset14_tbl.hpp +++ b/src/core/include/openvino/opsets/opset14_tbl.hpp @@ -221,4 +221,3 @@ _OPENVINO_OP_REG(FakeConvert, ov::op::v13) // New operations added in opset14 _OPENVINO_OP_REG(ConvertPromoteTypes, ov::op::v14) _OPENVINO_OP_REG(Inverse, ov::op::v14) -_OPENVINO_OP_REG(ROIAlignRotated, ov::op::v14) diff --git a/src/core/include/openvino/opsets/opset15_tbl.hpp b/src/core/include/openvino/opsets/opset15_tbl.hpp index 33e433e0a30afa..50c8603cf2046c 100644 --- a/src/core/include/openvino/opsets/opset15_tbl.hpp +++ b/src/core/include/openvino/opsets/opset15_tbl.hpp @@ -13,6 +13,7 @@ _OPENVINO_OP_REG(Convert, ov::op::v0) _OPENVINO_OP_REG(ShapeOf, ov::op::v3) // New operations added in opset15 +_OPENVINO_OP_REG(ROIAlignRotated, ov::op::v15) _OPENVINO_OP_REG(ScatterNDUpdate, ov::op::v15) _OPENVINO_OP_REG(EmbeddingBagPacked, ov::op::v15) _OPENVINO_OP_REG(EmbeddingBagOffsets, ov::op::v15) diff --git a/src/core/reference/include/openvino/reference/rms_norm.hpp b/src/core/reference/include/openvino/reference/rms_norm.hpp index 15b327596bf643..b235813a264a44 100644 --- a/src/core/reference/include/openvino/reference/rms_norm.hpp +++ b/src/core/reference/include/openvino/reference/rms_norm.hpp @@ -8,6 +8,7 @@ #include #include "openvino/reference/add.hpp" +#include "openvino/reference/convert.hpp" #include "openvino/reference/divide.hpp" #include "openvino/reference/multiply.hpp" #include "openvino/reference/power.hpp" @@ -72,5 +73,33 @@ void rms_norm(const T* in, rms_norm(in, axes, out, in_shape, eps); multiply(out, scale, out, in_shape, scale_shape, op::AutoBroadcastType::NUMPY); } + +/** + * @brief Reference implementation of RMS operator with output type conversion + * + * Math Formula: Convert((x / Sqrt(ReduceMean(x^2, axes) + eps)) * scale), T_OUT) + * + * @param in Input pointer to data + * @param axes Axes for reduce mean calculation + * @param out Output pointer to results + * @param in_shape Shape of the input Tensor + * @param eps Epsilon for not dividing by zero while normalizing the value + * @param scale_shape Shape of the scale Tensor + * @param scale Input pointer to scale + * + */ +template +void rms_norm_mul_convert_out(const T_IN* in, + const AxisSet& axes, + T_OUT* out, + const Shape& in_shape, + double eps, + const 
Shape& scale_shape, + const T_IN* scale) { + std::vector tmp_out(shape_size(in_shape)); + rms_norm(in, axes, tmp_out.data(), in_shape, eps, scale_shape, scale); + convert(tmp_out.data(), out, tmp_out.size()); +} + } // namespace reference } // namespace ov diff --git a/src/core/shape_inference/include/roi_align_rotated_shape_inference.hpp b/src/core/shape_inference/include/roi_align_rotated_shape_inference.hpp index 5faeaefa19ad21..0cdcf95dde0b54 100644 --- a/src/core/shape_inference/include/roi_align_rotated_shape_inference.hpp +++ b/src/core/shape_inference/include/roi_align_rotated_shape_inference.hpp @@ -8,12 +8,12 @@ namespace ov { namespace op { -namespace v14 { +namespace v15 { class ROIAlignRotated; template > std::vector shape_infer(const ROIAlignRotated* op, const std::vector& input_shapes) { return roi_align::shape_infer(op, input_shapes); } -} // namespace v14 +} // namespace v15 } // namespace op } // namespace ov diff --git a/src/core/src/op/roi_align_rotated.cpp b/src/core/src/op/roi_align_rotated.cpp index 4eec291bc468d7..0bdec26fa15476 100644 --- a/src/core/src/op/roi_align_rotated.cpp +++ b/src/core/src/op/roi_align_rotated.cpp @@ -8,7 +8,7 @@ namespace ov { namespace op { -namespace v14 { +namespace v15 { ROIAlignRotated::ROIAlignRotated(const Output& input, const Output& rois, const Output& batch_indices, @@ -49,6 +49,6 @@ std::shared_ptr ROIAlignRotated::clone_with_new_inputs(const OutputVector& get_spatial_scale(), get_clockwise_mode()); } -} // namespace v14 +} // namespace v15 } // namespace op } // namespace ov diff --git a/src/core/tests/opset.cpp b/src/core/tests/opset.cpp index e98c8644236592..c63b4759287a2a 100644 --- a/src/core/tests/opset.cpp +++ b/src/core/tests/opset.cpp @@ -74,8 +74,8 @@ INSTANTIATE_TEST_SUITE_P(opset, OpsetTestParams{ov::get_opset11, 177}, OpsetTestParams{ov::get_opset12, 178}, OpsetTestParams{ov::get_opset13, 186}, - OpsetTestParams{ov::get_opset14, 189}, - OpsetTestParams{ov::get_opset15, 7}), + OpsetTestParams{ov::get_opset14, 188}, + OpsetTestParams{ov::get_opset15, 8}), OpsetTestNameGenerator{}); class MyOpOld : public ov::op::Op { diff --git a/src/core/tests/type_prop/roi_align.cpp b/src/core/tests/type_prop/roi_align.cpp index 964de695315f79..8bf7f734a07616 100644 --- a/src/core/tests/type_prop/roi_align.cpp +++ b/src/core/tests/type_prop/roi_align.cpp @@ -32,7 +32,7 @@ class ROIAlignTest : public testing::Test { ov::Dimension::value_type GetROISecondDimSizeForOp() const { // Those magic numbers comes from definition of ROIAlign ops. 
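        // ROIAlignRotated describes each ROI with 5 values (center x, center y, width,
        // height, rotation angle), while the other ROIAlign versions use 4 box
        // coordinates (x_1, y_1, x_2, y_2), hence the 5 vs 4 returned below.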
- if (std::is_same::value) + if (std::is_same::value) return 5; return 4; } @@ -281,5 +281,5 @@ REGISTER_TYPED_TEST_SUITE_P(ROIAlignTest, data_and_rois_not_same_type, batch_indicies_not_integer); -typedef Types ROIAlignTypes; +typedef Types ROIAlignTypes; INSTANTIATE_TYPED_TEST_SUITE_P(type_prop, ROIAlignTest, ROIAlignTypes); diff --git a/src/frontends/onnx/frontend/src/op/mmdeploy_roi_align_rotated.cpp b/src/frontends/onnx/frontend/src/op/mmdeploy_roi_align_rotated.cpp index 588cf07646bf7d..565c73674c75fd 100644 --- a/src/frontends/onnx/frontend/src/op/mmdeploy_roi_align_rotated.cpp +++ b/src/frontends/onnx/frontend/src/op/mmdeploy_roi_align_rotated.cpp @@ -56,7 +56,7 @@ ov::OutputVector mmdeploy_roi_align_rotated(const ov::frontend::onnx::Node& node const auto spatial_scale = node.get_attribute_value("spatial_scale", 1.0f); const auto clockwise = static_cast(node.get_attribute_value("clockwise", 0)); - return {std::make_shared(data, + return {std::make_shared(data, rois, rois_batch_idx, static_cast(pooled_h), diff --git a/src/frontends/onnx/frontend/src/op/reduce.cpp b/src/frontends/onnx/frontend/src/op/reduce.cpp index 9ba7e6c36fa9e8..ed4aaa5d13dc45 100644 --- a/src/frontends/onnx/frontend/src/op/reduce.cpp +++ b/src/frontends/onnx/frontend/src/op/reduce.cpp @@ -136,6 +136,14 @@ std::shared_ptr make_ov_reduction_op(const Node& node, return set_1::identity(node).at(0).get_node_shared_ptr(); } } + +std::shared_ptr onnx_reduce_sum_square(const ov::frontend::onnx::Node& node, + const std::set& supported_types, + const bool axes_as_attr = true) { + const auto input = ov::Output{node.get_ov_inputs().at(0)}; + const auto square_node = std::make_shared(input, input); + return make_ov_reduction_op(node, square_node, supported_types, axes_as_attr); +} } // namespace namespace set_1 { @@ -180,9 +188,7 @@ ov::OutputVector reduce_sum(const ov::frontend::onnx::Node& node) { } ov::OutputVector reduce_sum_square(const ov::frontend::onnx::Node& node) { - const auto input = ov::Output{node.get_ov_inputs().at(0)}; - const auto square_node = std::make_shared(input, input); - return {make_ov_reduction_op(node, square_node, supported_types_v1)}; + return {onnx_reduce_sum_square(node, supported_types_v1)}; } } // namespace set_1 @@ -199,9 +205,11 @@ namespace set_13 { ov::OutputVector reduce_sum(const ov::frontend::onnx::Node& node) { return {make_ov_reduction_op(node, node.get_ov_inputs().at(0), supported_types_v2, false)}; } + ov::OutputVector reduce_l2(const Node& node) { return {make_ov_reduction_op(node, node.get_ov_inputs().at(0), supported_types_v2)}; } + ov::OutputVector reduce_max(const ov::frontend::onnx::Node& node) { return {make_ov_reduction_op(node, node.get_ov_inputs().at(0), supported_types_v3)}; } @@ -209,6 +217,10 @@ ov::OutputVector reduce_max(const ov::frontend::onnx::Node& node) { ov::OutputVector reduce_min(const ov::frontend::onnx::Node& node) { return {make_ov_reduction_op(node, node.get_ov_inputs().at(0), supported_types_v3)}; } + +ov::OutputVector reduce_sum_square(const ov::frontend::onnx::Node& node) { + return {onnx_reduce_sum_square(node, supported_types_v2)}; +} } // namespace set_13 namespace set_18 { @@ -228,6 +240,10 @@ ov::OutputVector reduce_log_sum(const ov::frontend::onnx::Node& node) { make_ov_reduction_op(node, node.get_ov_inputs().at(0), supported_types_v2, false); return {std::make_shared(sum_node)}; } + +ov::OutputVector reduce_sum_square(const ov::frontend::onnx::Node& node) { + return {onnx_reduce_sum_square(node, supported_types_v2, false)}; +} } // namespace 
set_18 namespace set_20 { diff --git a/src/frontends/onnx/frontend/src/op/reduce.hpp b/src/frontends/onnx/frontend/src/op/reduce.hpp index f361c89fd2d960..740a6127fc3303 100644 --- a/src/frontends/onnx/frontend/src/op/reduce.hpp +++ b/src/frontends/onnx/frontend/src/op/reduce.hpp @@ -79,6 +79,12 @@ ov::OutputVector reduce_sum(const ov::frontend::onnx::Node& node); namespace set_1 { ov::OutputVector reduce_sum_square(const ov::frontend::onnx::Node& node); } // namespace set_1 +namespace set_13 { +ov::OutputVector reduce_sum_square(const ov::frontend::onnx::Node& node); +} // namespace set_13 +namespace set_18 { +ov::OutputVector reduce_sum_square(const ov::frontend::onnx::Node& node); +} // namespace set_18 } // namespace op } // namespace onnx diff --git a/src/frontends/onnx/frontend/src/ops_bridge.cpp b/src/frontends/onnx/frontend/src/ops_bridge.cpp index 02a79979ecaa73..5ab8a792c1fefa 100644 --- a/src/frontends/onnx/frontend/src/ops_bridge.cpp +++ b/src/frontends/onnx/frontend/src/ops_bridge.cpp @@ -503,6 +503,8 @@ OperatorsBridge::OperatorsBridge() { REGISTER_OPERATOR("ReduceSum", 1, reduce_sum); REGISTER_OPERATOR("ReduceSum", 13, reduce_sum); REGISTER_OPERATOR("ReduceSumSquare", 1, reduce_sum_square); + REGISTER_OPERATOR("ReduceSumSquare", 13, reduce_sum_square); + REGISTER_OPERATOR("ReduceSumSquare", 18, reduce_sum_square); REGISTER_OPERATOR("Relu", 1, relu); REGISTER_OPERATOR("Reshape", 1, reshape); REGISTER_OPERATOR("Resize", 1, resize); diff --git a/src/frontends/onnx/tests/models/reduce_sum_square_13.prototxt b/src/frontends/onnx/tests/models/reduce_sum_square_13.prototxt new file mode 100644 index 00000000000000..590fa0c7414504 --- /dev/null +++ b/src/frontends/onnx/tests/models/reduce_sum_square_13.prototxt @@ -0,0 +1,48 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + node { + input: "A" + output: "B" + op_type: "ReduceSumSquare" + } + name: "compute_graph" + input { + name: "A" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 1 + } + dim { + dim_value: 4 + } + dim { + dim_value: 4 + } + } + } + } + } + output { + name: "B" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + } + } + } + } +} +opset_import { + version: 13 +} diff --git a/src/frontends/onnx/tests/models/reduce_sum_square_18.prototxt b/src/frontends/onnx/tests/models/reduce_sum_square_18.prototxt new file mode 100644 index 00000000000000..4b9e0f944565eb --- /dev/null +++ b/src/frontends/onnx/tests/models/reduce_sum_square_18.prototxt @@ -0,0 +1,48 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + node { + input: "A" + output: "B" + op_type: "ReduceSumSquare" + } + name: "compute_graph" + input { + name: "A" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 1 + } + dim { + dim_value: 4 + } + dim { + dim_value: 4 + } + } + } + } + } + output { + name: "B" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + } + } + } + } +} +opset_import { + version: 18 +} diff --git a/src/frontends/onnx/tests/onnx_import.in.cpp b/src/frontends/onnx/tests/onnx_import.in.cpp index 3aa45042d6276b..59a53c0016eb1a 100644 --- a/src/frontends/onnx/tests/onnx_import.in.cpp +++ b/src/frontends/onnx/tests/onnx_import.in.cpp @@ -1257,6 +1257,38 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_model_reduce_sum_square) { test_case.run(); } +OPENVINO_TEST(${BACKEND_NAME}, onnx_model_reduce_sum_square_13) { + auto model = convert_model("reduce_sum_square_13.onnx"); + + 
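+    // ReduceSumSquare is ReduceSum(x * x); with no axes given it reduces over all
+    // dimensions, so the (1, 1, 4, 4) tensor of ones below sums to a single value of 16.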
// input data shape (1, 1, 4, 4) + Inputs inputs{ + ov::test::NDArray({{{{1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}}}}).get_vector()}; + + // output data shape (1,) + auto expected_output = ov::test::NDArray({{{{16}}}}).get_vector(); + + auto test_case = ov::test::TestCase(model, s_device); + test_case.add_multiple_inputs(inputs); + test_case.add_expected_output(expected_output); + test_case.run(); +} + +OPENVINO_TEST(${BACKEND_NAME}, onnx_model_reduce_sum_square_18) { + auto model = convert_model("reduce_sum_square_18.onnx"); + + // input data shape (1, 1, 4, 4) + Inputs inputs{ + ov::test::NDArray({{{{1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}}}}).get_vector()}; + + // output data shape (1,) + auto expected_output = ov::test::NDArray({{{{16}}}}).get_vector(); + + auto test_case = ov::test::TestCase(model, s_device); + test_case.add_multiple_inputs(inputs); + test_case.add_expected_output(expected_output); + test_case.run(); +} + OPENVINO_TEST(${BACKEND_NAME}, onnx_model_reduce_sum_13_axes_as_constant) { auto model = convert_model("reduce_sum_13_axes_as_constant.onnx"); diff --git a/src/frontends/onnx/tests/tests_python/test_backend.py b/src/frontends/onnx/tests/tests_python/test_backend.py index 75ae10fb9e55c4..91bd1701ca34b5 100644 --- a/src/frontends/onnx/tests/tests_python/test_backend.py +++ b/src/frontends/onnx/tests/tests_python/test_backend.py @@ -488,12 +488,6 @@ def expect_fail(test_case_path, xfail): # type: (str) -> None "OnnxBackendNodeModelTest.test_reduce_prod_keepdims_random_cpu", "OnnxBackendNodeModelTest.test_reduce_prod_negative_axes_keepdims_example_cpu", "OnnxBackendNodeModelTest.test_reduce_prod_negative_axes_keepdims_random_cpu", - "OnnxBackendNodeModelTest.test_reduce_sum_square_do_not_keepdims_example_cpu", - "OnnxBackendNodeModelTest.test_reduce_sum_square_do_not_keepdims_random_cpu", - "OnnxBackendNodeModelTest.test_reduce_sum_square_keepdims_example_cpu", - "OnnxBackendNodeModelTest.test_reduce_sum_square_keepdims_random_cpu", - "OnnxBackendNodeModelTest.test_reduce_sum_square_negative_axes_keepdims_example_cpu", - "OnnxBackendNodeModelTest.test_reduce_sum_square_negative_axes_keepdims_random_cpu", ), ( xfail_issue_99969, @@ -685,7 +679,6 @@ def expect_fail(test_case_path, xfail): # type: (str) -> None "OnnxBackendNodeModelTest.test_reduce_l1_empty_set_cpu", "OnnxBackendNodeModelTest.test_reduce_log_sum_exp_empty_set_cpu", "OnnxBackendNodeModelTest.test_reduce_prod_empty_set_cpu", - "OnnxBackendNodeModelTest.test_reduce_sum_square_empty_set_cpu", ), ( skip_misalignment, diff --git a/src/frontends/pytorch/src/op/expand.cpp b/src/frontends/pytorch/src/op/expand.cpp index 5005e34ebb7384..8e9ce327e647d5 100644 --- a/src/frontends/pytorch/src/op/expand.cpp +++ b/src/frontends/pytorch/src/op/expand.cpp @@ -42,24 +42,30 @@ OutputVector translate_expand_as(const NodeContext& context) { }; OutputVector translate_expand_fx(const NodeContext& context) { - // aten::expand(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a) - num_inputs_check(context, 2, 3); + auto num_inputs = context.get_input_size(); + num_inputs_check(context, 2, num_inputs); auto x = context.get_input(0); - // TODO: This is a temporary solution to optimize out Broadcast if the input and - // output shapes are same. This should be removed after a proper optimization is - // implemented. 
- auto sizes_const = context.const_input(1); - if (x.get_partial_shape().is_static() && x.get_shape() == sizes_const) { - return {x}; - } + std::vector shape_vec; auto sizes = context.get_input(1); - // TODO: figure out what implicit means - PYTORCH_OP_CONVERSION_CHECK(context.input_is_none(2) || context.const_input(2) == false, - "Unexpected value of implicit for expand operation"); + if (num_inputs != 2) { + for (size_t i = 1; i < num_inputs; i++) { + auto a = context.get_input_from_visible_context(i).get_node_shared_ptr(); + auto shape_input = context.get_input(static_cast(i)); + if (std::dynamic_pointer_cast(a) || + shape_input.get_partial_shape().rank().is_dynamic() || + shape_input.get_partial_shape().rank().get_length() == 0) { + shape_vec.push_back(-1); + } else { + auto val = context.const_input(i); + shape_vec.push_back(val); + } + } + sizes = ov::op::v0::Constant::create(element::i32, Shape{num_inputs - 1}, shape_vec); + } return base_expand(context, x, sizes); }; } // namespace op } // namespace pytorch } // namespace frontend -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/frontends/pytorch/src/op/reshape.cpp b/src/frontends/pytorch/src/op/reshape.cpp index 97306ba1f6d4e7..edea4c7aefb44a 100644 --- a/src/frontends/pytorch/src/op/reshape.cpp +++ b/src/frontends/pytorch/src/op/reshape.cpp @@ -5,6 +5,9 @@ #include "openvino/op/reshape.hpp" #include "openvino/frontend/pytorch/node_context.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/squeeze.hpp" +#include "openvino/op/unsqueeze.hpp" #include "utils.hpp" namespace ov { @@ -22,6 +25,77 @@ OutputVector translate_reshape(const NodeContext& context) { return {context.mark_node(reshape)}; }; +OutputVector translate_reshape_fx(const NodeContext& context) { + // Schema: aten.view.default(Tensor input, int[] shape) -> Tensor + auto num_inputs = context.get_input_size(); + num_inputs_check(context, 2, num_inputs); + std::vector shape_vec; + if (context.get_input_type(1).is()) { + int num_dyn_dims = 0; + for (size_t i = 1; i < num_inputs; i++) { + auto shape_input = context.get_input(static_cast(i)); + if (context.get_input_type(i).as().element_type.is()) { + auto const_val = context.const_input(i); + shape_vec.push_back(const_val); + } else { + // Set dimension to be dynamic if it's coming from an argument or another node + shape_vec.push_back(-1); + num_dyn_dims++; + } + } + // We cannot use multiple -1s if there are more than 1 dynamic dimensions + if (num_dyn_dims >= 2) { + auto inp_shape = context.get_input(0).get_partial_shape(); + // If there are multiple dynamic dymensions, we cannot support inputs with dynamic rank + if (inp_shape.rank().is_static()) { + auto zero = context.mark_node(ov::op::v0::Constant::create(element::i32, Shape{1}, {0})); + if (inp_shape.size() >= 3 && inp_shape.size() + 1 == shape_vec.size() && shape_vec[0] == 1 && + inp_shape[0] == shape_vec[1]) { + // [N, ...] -> [1, N, ...] Can be translated to Unsqueeze + auto unsqueeze = + context.mark_node(std::make_shared(context.get_input(0), zero)); + return {unsqueeze}; + } else if (shape_vec.size() >= 3 && shape_vec.size() + 1 == inp_shape.size() && inp_shape[0] == 1 && + inp_shape[1] == shape_vec[0]) { + // [1, N, ...] -> [N, ...] 
Can be translated to Squeeze + auto squeeze = context.mark_node(std::make_shared(context.get_input(0), zero)); + return {squeeze}; + } else if (inp_shape.size() == shape_vec.size()) { + // If the input rank is equal to output rank, we can use 0s in place of dynamic dimensions + for (size_t k = 0; k < shape_vec.size(); k++) { + if (shape_vec[k] == -1) + shape_vec[k] = 0; + } + } else { + FRONT_END_GENERAL_CHECK( + false, + "Cannot support reshape with multiple dynamic dimensions for unequal ranks"); + } + } else { + FRONT_END_GENERAL_CHECK( + false, + "Cannot support reshape with multiple dynamic dimensions for dynamic input ranks"); + } + } + + auto shape_const = ov::op::v0::Constant::create(element::i32, Shape{num_inputs - 1}, shape_vec); + auto reshape = std::make_shared(context.get_input(0), shape_const, true); + return {context.mark_node(reshape)}; + } else { + auto shape_input = context.get_input(1); + if (shape_input.get_partial_shape().rank().is_dynamic() || + shape_input.get_partial_shape().rank().get_length() == 0) { + shape_vec.push_back(0); + auto shape_const = ov::op::v0::Constant::create(element::i32, Shape{1}, shape_vec); + auto result = + context.mark_node(std::make_shared(context.get_input(0), shape_const, true)); + return {result}; + } + auto reshape = std::make_shared(context.get_input(0), context.get_input(1), true); + return {context.mark_node(reshape)}; + } +}; + } // namespace op } // namespace pytorch } // namespace frontend diff --git a/src/frontends/pytorch/src/op/slice.cpp b/src/frontends/pytorch/src/op/slice.cpp index e718183ae13d7e..6994979a1e77e9 100644 --- a/src/frontends/pytorch/src/op/slice.cpp +++ b/src/frontends/pytorch/src/op/slice.cpp @@ -18,7 +18,9 @@ namespace op { using namespace ov::op; -OutputVector translate_slice_common(const NodeContext& context, const size_t num_inputs) { +OutputVector translate_slice_common(const NodeContext& context, + const size_t num_inputs, + const bool stop_dynamic_rank_unsqueeze = true) { // aten::slice.t(t[] l, int? start=None, int? end=None, int step=1) -> (t[]) // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> (Tensor(a)) ov::Output dim; @@ -56,7 +58,9 @@ OutputVector translate_slice_common(const NodeContext& context, const size_t num ov::Output end; if (!context.input_is_none(end_idx)) { end = context.get_input(end_idx); - if (end.get_partial_shape().rank().is_dynamic() || end.get_partial_shape().rank().get_length() == 0) { + // TODO: Find a better way to solve the issue with dynamic ranks for "end" + if ((stop_dynamic_rank_unsqueeze && end.get_partial_shape().rank().is_dynamic()) || + (!(end.get_partial_shape().rank().is_dynamic()) && end.get_partial_shape().rank().get_length() == 0)) { end = context.mark_node(std::make_shared(end, axis_0)); } } else { @@ -81,10 +85,10 @@ OutputVector translate_slice(const NodeContext& context) { OutputVector translate_slice_fx(const NodeContext& context) { // slice.Tensor(Tensor(a) self, int dim=0, SymInt? start=None, SymInt? 
end=None, SymInt step=1) -> Tensor(a) // FX version of slice have the inputs in the same order as it has 5 inputs, even if it has less than 5 inputs - return translate_slice_common(context, 5); + return translate_slice_common(context, 5, false); }; } // namespace op } // namespace pytorch } // namespace frontend -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index af09480ea4282d..ef53c75d0fe369 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp @@ -284,6 +284,7 @@ OP_CONVERTER(translate_new_zeros_fx); OP_CONVERTER(translate_ones_fx); OP_CONVERTER(translate_ones_like_fx); OP_CONVERTER(translate_reflection_pad_nd_fx); +OP_CONVERTER(translate_reshape_fx); OP_CONVERTER(translate_rsub_fx); OP_CONVERTER(translate_scalar_tensor_fx); OP_CONVERTER(translate_scaled_dot_product_attention_fx); @@ -733,7 +734,11 @@ const std::map get_supported_ops_ts() { const std::map get_supported_ops_fx() { return { + {"", op::translate_add}, + {"", op::translate_floor_divide}, {"", op::translate_getitem}, // TODO: Check if there is any other way to handle this + {"", op::translate_mul}, + {"", op::translate_sub}, {"aten._adaptive_avg_pool1d.default", op::translate_adaptive_avg_pool1d}, {"aten._adaptive_avg_pool2d.default", op::translate_adaptive_avg_pool2d}, {"aten._adaptive_avg_pool3d.default", op::translate_adaptive_avg_pool3d}, @@ -949,6 +954,7 @@ const std::map get_supported_ops_fx() { {"aten.sub.Tensor", op::translate_sub_fx}, {"aten.sum.default", op::translate_sum_fx}, {"aten.sum.dim_IntList", op::translate_sum_fx}, + {"aten.sym_size.int", op::translate_size}, {"aten.t.default", op::translate_t}, {"aten.tan.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, {"aten.tanh.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, @@ -961,7 +967,7 @@ const std::map get_supported_ops_fx() { {"aten.upsample_nearest2d.default", op::translate_upsample_nearest2d}, {"aten.var.correction", op::translate_var_fx}, {"aten.var_mean.correction", op::translate_var_mean_fx}, - {"aten.view.default", op::translate_reshape}, + {"aten.view.default", op::translate_reshape_fx}, {"aten.where.self", op::translate_where}, {"aten.zeros.default", op::translate_zeros_fx}, {"aten.zeros.names", op::translate_zeros_fx}, diff --git a/src/inference/include/openvino/runtime/intel_gpu/properties.hpp b/src/inference/include/openvino/runtime/intel_gpu/properties.hpp index 7f661d5b67a74a..185195e288805c 100644 --- a/src/inference/include/openvino/runtime/intel_gpu/properties.hpp +++ b/src/inference/include/openvino/runtime/intel_gpu/properties.hpp @@ -115,6 +115,14 @@ static constexpr Property host_task_priority{"GPU_HOST_TASK_ * @ingroup ov_runtime_ocl_gpu_prop_cpp_api */ static constexpr Property available_device_mem{"AVAILABLE_DEVICE_MEM_SIZE"}; + +/** + * @brief Turning on this key disables SDPA operation decomposition and keeps SDPA operation in the graph. + * Enabling SDPA optimization may provide performance improvements and memory usage reduction. + * This key serves as a recommendation and may be ignored in known sub-optimal cases. 
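+ *
+ * A minimal usage sketch (assuming an existing ov::Core instance and that the property
+ * is passed as part of the compilation config):
+ * @code
+ * auto compiled = core.compile_model(model, "GPU",
+ *                                    ov::intel_gpu::hint::enable_sdpa_optimization(true));
+ * @endcode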
+ * @ingroup ov_runtime_ocl_gpu_prop_cpp_api + */ +static constexpr Property enable_sdpa_optimization{"GPU_ENABLE_SDPA_OPTIMIZATION"}; } // namespace hint /** diff --git a/src/plugins/auto/src/auto_schedule.cpp b/src/plugins/auto/src/auto_schedule.cpp index 0c248a2b491054..3de0cc9f00b871 100644 --- a/src/plugins/auto/src/auto_schedule.cpp +++ b/src/plugins/auto/src/auto_schedule.cpp @@ -133,7 +133,6 @@ void AutoSchedule::init() { if (m_compile_context[ACTUALDEVICE].m_is_enabled) { LOG_INFO_TAG("select device:%s", m_compile_context[ACTUALDEVICE].m_device_info.device_name.c_str()); bool is_actual_cpu = m_compile_context[ACTUALDEVICE].m_device_info.device_name.find("CPU") != std::string::npos; - bool is_actual_gpu = m_compile_context[ACTUALDEVICE].m_device_info.device_name.find("GPU") != std::string::npos; // if Actual device is CPU or perf_hint is cumulative, disabled m_compile_context[CPU], only use // m_compile_context[ACTUALDEVICE] if (is_actual_cpu || !m_context->m_startup_fallback) { @@ -148,29 +147,11 @@ void AutoSchedule::init() { // limit the threads num for compiling auto device = m_compile_context[ACTUALDEVICE].m_device_info.device_name; auto& device_config = m_compile_context[ACTUALDEVICE].m_device_info.config; - if (is_actual_gpu) { - int max_threads = 0; - try { - max_threads = m_context->m_ov_core->get_property(device, ov::compilation_num_threads); - } catch (const ov::Exception&) { - LOG_DEBUG_TAG("cannot get MAX_NUM_THREADS from GPU"); - } - if (max_threads == static_cast(std::thread::hardware_concurrency())) { - int thread_num = max_threads / 2; - m_compile_context[ACTUALDEVICE].m_device_info.config.insert( - ov::compilation_num_threads(thread_num)); - LOG_DEBUG_TAG("gpu streams number for compiling: %d", thread_num); - } else { - // user set the compiling threads num - // use the user's val anyway - LOG_DEBUG_TAG("user defined compiling threads: %d", max_threads); - } - } std::string cache_dir = device_config.count(ov::cache_dir.name()) ? device_config[ov::cache_dir.name()].as() : m_context->m_ov_core->get_property("", ov::cache_dir); - if (!m_context->m_is_set_startup_fallback && !cache_dir.empty()) { + if (m_context->m_startup_fallback && !cache_dir.empty()) { const auto properties = m_context->m_ov_core->create_compile_config(ov::DeviceIDParser(device).get_device_name(), device_config); @@ -323,15 +304,20 @@ void AutoSchedule::try_to_compile_model(AutoCompileContext& context, const std:: device_config.find(ov::compilation_num_threads.name()) != device_config.end()); if (cur_dev_is_gpu && m_compile_context[CPU].m_is_enabled && !is_already_set_gpu) { device_config.insert(ov::intel_gpu::hint::host_task_priority(ov::hint::Priority::HIGH)); - auto proc_type_table = get_org_proc_type_table(); - int compilation_num_threads = proc_type_table[0][MAIN_CORE_PROC] != 0 - ? proc_type_table[0][MAIN_CORE_PROC] - : proc_type_table[0][EFFICIENT_CORE_PROC]; - if (device_config.insert(ov::compilation_num_threads(compilation_num_threads)).second) - LOG_DEBUG_TAG("gpu streams number for compiling: %d", compilation_num_threads); - else - LOG_DEBUG_TAG("user defined compiling threads: %d", - device_config[ov::compilation_num_threads.name()].as()); + int max_threads = 0; + try { + m_context->m_ov_core->get_property(device, ov::compilation_num_threads); + auto proc_type_table = get_org_proc_type_table(); + max_threads = proc_type_table[0][MAIN_CORE_PROC] != 0 ? 
proc_type_table[0][MAIN_CORE_PROC] + : proc_type_table[0][EFFICIENT_CORE_PROC]; + if (device_config.insert(ov::compilation_num_threads(max_threads)).second) + LOG_DEBUG_TAG("gpu streams number for compiling: %d", max_threads); + else + LOG_DEBUG_TAG("user defined compiling threads: %d", + device_config[ov::compilation_num_threads.name()].as()); + } catch (const ov::Exception&) { + LOG_DEBUG_TAG("cannot get MAX_NUM_THREADS from GPU"); + } } } try { diff --git a/src/plugins/auto/src/common.hpp b/src/plugins/auto/src/common.hpp index 63fb8753e4fff2..28567eb23392c4 100644 --- a/src/plugins/auto/src/common.hpp +++ b/src/plugins/auto/src/common.hpp @@ -207,7 +207,6 @@ class ScheduleContext : public std::enable_shared_from_this { bool m_need_perf_counters; bool m_batching_disabled = false; bool m_startup_fallback = true; - bool m_is_set_startup_fallback = false; bool m_runtime_fallback = true; bool m_bind_buffer = false; std::shared_ptr m_model; diff --git a/src/plugins/auto/src/plugin.cpp b/src/plugins/auto/src/plugin.cpp index 9d8174252d21c9..06b3b7dbc947e4 100644 --- a/src/plugins/auto/src/plugin.cpp +++ b/src/plugins/auto/src/plugin.cpp @@ -436,7 +436,6 @@ std::shared_ptr Plugin::compile_model_impl(const std::string OPENVINO_ASSERT(auto_s_context->m_ov_core); auto_s_context->m_log_tag = get_device_name(); auto_s_context->m_model_precision = model_precision; - auto_s_context->m_is_set_startup_fallback = load_config.is_set_by_user(ov::intel_auto::enable_startup_fallback); auto_s_context->m_startup_fallback = load_config.get_property(ov::intel_auto::enable_startup_fallback); auto_s_context->m_runtime_fallback = load_config.get_property(ov::intel_auto::enable_runtime_fallback); auto_s_context->m_bind_buffer = load_config.get_property(ov::intel_auto::device_bind_buffer); diff --git a/src/plugins/auto/tests/functional/behavior/caching_test.cpp b/src/plugins/auto/tests/functional/behavior/caching_test.cpp index 1b606470fa2f53..1b2df23f9c0d1c 100644 --- a/src/plugins/auto/tests/functional/behavior/caching_test.cpp +++ b/src/plugins/auto/tests/functional/behavior/caching_test.cpp @@ -32,7 +32,7 @@ TEST_F(AutoFuncTests, compiled_with_cache_enabled) { core.set_property(ov::cache_dir("")); } -TEST_F(AutoFuncTests, load_cached_model_to_actual_device_and_disable_CPU_accelerating) { +TEST_F(AutoFuncTests, load_cached_model_to_actual_device_and_disable_CPU_accelerating_default_startup_fallback) { core.set_property(ov::cache_dir(cache_path)); core.set_property("MOCK_GPU", ov::device::id("test")); // device id for cache property distinguish with MOCK_CPU { @@ -74,6 +74,49 @@ TEST_F(AutoFuncTests, load_cached_model_to_actual_device_and_disable_CPU_acceler core.set_property(ov::cache_dir("")); } +TEST_F(AutoFuncTests, load_cached_model_to_actual_device_and_disable_CPU_accelerating_set_startup_fallback) { + core.set_property(ov::cache_dir(cache_path)); + core.set_property("MOCK_GPU", ov::device::id("test")); // device id for cache property distinguish with MOCK_CPU + { + auto compiled_model = core.compile_model(model_cannot_batch, + "AUTO", + {ov::device::priorities("MOCK_GPU", "MOCK_CPU"), + ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT)}); + } + // No cached model for actual device + // will cache model for both actual device and CPU plugin + ASSERT_EQ(ov::test::utils::listFilesWithExt(cache_path, "blob").size(), 2); + ov::test::utils::removeFilesWithExt(cache_path, "blob"); + { + auto compiled_model = core.compile_model( + model_cannot_batch, + "AUTO", + {ov::device::priorities("MOCK_GPU"), 
ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT)}); + } + { + auto compiled_model = core.compile_model(model_cannot_batch, + "AUTO", + {ov::device::priorities("MOCK_GPU", "MOCK_CPU"), + ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT), + ov::intel_auto::enable_startup_fallback(true)}); + } + // cached model exists for actual device + // will reuse cached model for actual device without CPU accelerating(No cached model for CPU) + ASSERT_EQ(ov::test::utils::listFilesWithExt(cache_path, "blob").size(), 1); + core.set_property("MOCK_GPU", ov::device::id("test_regenerate")); + { + auto compiled_model = core.compile_model(model_cannot_batch, + "AUTO", + {ov::device::priorities("MOCK_GPU", "MOCK_CPU"), + ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT), + ov::intel_auto::enable_startup_fallback(false)}); + } + // model hash id changed for actual device + // will cache 2 models for actual device and no cached model for CPU + ASSERT_EQ(ov::test::utils::listFilesWithExt(cache_path, "blob").size(), 2); + core.set_property(ov::cache_dir("")); +} + TEST_F(AutoFuncTests, compiled_with_cache_enabled_batch_enabled) { #ifdef ENABLE_AUTO_BATCH core.set_property(ov::cache_dir(cache_path)); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp index f7afc9641bbdce..bd05801c139dc8 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp @@ -82,6 +82,11 @@ static constexpr size_t vec_len_f32_avx2 = vec_len_avx2 / sizeof(float); auto vec_f16 = _mm256_loadu_si256(reinterpret_cast(a)); return _mm512_cvtph_ps(vec_f16); } + inline __m512 mm512_uni_loadu_tail_ps(const ov::float16* a, size_t count) { + auto mask = (1 << count) - 1; + auto f16_vec = _mm256_maskz_loadu_epi16(mask, a); + return _mm512_cvtph_ps(f16_vec); + } inline void mm512_uni_storeu_ps(ov::float16* addr, __m512 v) { __m256i vec_f16 = _mm512_cvtps_ph(v, 0); _mm256_storeu_si256(reinterpret_cast<__m256i *>(addr), vec_f16); @@ -149,6 +154,11 @@ static constexpr size_t vec_len_f32_avx2 = vec_len_avx2 / sizeof(float); auto o = _mm256_cvtph_ps(vec_f16); return o; } + inline __m256 mm256_uni_loadu_tail_ps(const ov::float16* a, const size_t count) { + ov::float16 tmp_values[8] = {0}; + std::memcpy(tmp_values, a, count * sizeof(ov::float16)); + return mm256_uni_loadu_ps(tmp_values); + } inline void mm256_uni_storeu_ps(ov::float16* a, __m256 v) { __m128i vec_f16 = _mm256_cvtps_ph(v, 0); _mm_storeu_si128(reinterpret_cast<__m128i *>(a), vec_f16); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp index cd46be617465b4..d07f7490f1bbed 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp @@ -599,10 +599,11 @@ static void attn_reduce(T* dst, float* temp, size_t M, size_t S, size_t temp_str } } -// N and K must be multiple of 16 +// N must be multiple of 16 template -void transpose_16Nx16K(TDST* dst, TSRC* src, TDST* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { - for (size_t k = 0; k < K; k += 16) { +void transpose_16NxK(TDST* dst, TSRC* src, TDST* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { + size_t k = 0; + for (; k + 16 <= K; k += 16) { for (size_t n = 0; n < N; n += 16) { transpose_16x16_kernel(dst + n, 
src + n * src_stride, dst_stride, src_stride); } @@ -610,19 +611,24 @@ void transpose_16Nx16K(TDST* dst, TSRC* src, TDST* tmp, size_t N, size_t K, size dst += 16 * dst_stride; src += 16; } + if (k < K) { + for (size_t n = 0; n < N; n += 16) { + transpose_16xK_kernel(dst + n, src + n * src_stride, K - k, dst_stride, src_stride); + } + } } #if defined(HAVE_AVX512F) -static void transpose_16Nx16K(ov::bfloat16* dst, ov::bfloat16* src, ov::bfloat16* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { +static void transpose_16NxK(ov::bfloat16* dst, ov::bfloat16* src, ov::bfloat16* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // will treat as uint32_t transpose auto s = reinterpret_cast(src); auto d = reinterpret_cast(dst); - transpose_16Nx16K(d, s, reinterpret_cast(0), N, K >> 1, dst_stride, src_stride >> 1); + transpose_16NxK(d, s, reinterpret_cast(0), N, K >> 1, dst_stride, src_stride >> 1); } #endif template -void transpose_16Nx16K(TDST* dst, uint8_t* src, TDST* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { +void transpose_16NxK(TDST* dst, uint8_t* src, TDST* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // The layout for per token per head: // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| // The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) @@ -634,7 +640,7 @@ void transpose_16Nx16K(TDST* dst, uint8_t* src, TDST* tmp, size_t N, size_t K, s s += src_stride + 2 * sizeof(float); t += src_stride; } - transpose_16Nx16K(dst, tmp, reinterpret_cast(0), N, K, dst_stride, src_stride); + transpose_16NxK(dst, tmp, reinterpret_cast(0), N, K, dst_stride, src_stride); } // dequant f16/u8 to float @@ -664,55 +670,55 @@ void dequant(TDST* dst, uint8_t* src, size_t N, size_t K) { #if defined(HAVE_AVX512F) // pack bf16/u8 to bf16 -static void pack_32x32_kernel(ov::bfloat16* dst, ov::bfloat16* src, size_t stride) { +static void pack_32x32_kernel(ov::bfloat16* dst, ov::bfloat16* src, size_t dst_stride, size_t src_stride) { static const uint64_t idx[8] = {0, 4, 1, 5, 2, 6, 3, 7}; auto midx = _mm512_loadu_si512(idx); for (size_t i = 0; i < 16; i++) { auto a = _mm512_loadu_si512(src); // [a1 a2 a3 a4 | a5 a6 a7 a8] total 512-bits in 8 64bits unit - auto b = _mm512_loadu_si512(src + stride); // [b1 b2 b3 b4 | b5 b6 b7 b8] total 512-bits + auto b = _mm512_loadu_si512(src + src_stride); // [b1 b2 b3 b4 | b5 b6 b7 b8] total 512-bits a = _mm512_permutexvar_epi64(midx, a); // [a1 a5 | a2 a6 | a3 a7 | a4 a8] b = _mm512_permutexvar_epi64(midx, b); // [b1 b5 | b2 b6 | b3 b7 | b4 b8] auto B0 = _mm512_unpacklo_epi16(a, b); // [ a1&b1 a2&b2 a3&b3 a4&b4] for each 128-bits lane, interleave word in low 64 bits auto B1 = _mm512_unpackhi_epi16(a, b); // [ a5&b5 a6&b6 a7&b7 a8&b8] for each 128-bits lane, interleave word in high 64 bits _mm512_storeu_si512(dst, B0); _mm512_storeu_si512(dst + 32, B1); - src += 2 * stride; - dst += 2 * stride; + src += 2 * src_stride; + dst += 2 * dst_stride; } } -static void pack_32x16_kernel(ov::bfloat16* dst, ov::bfloat16* src, size_t stride) { +static void pack_32x16_kernel(ov::bfloat16* dst, ov::bfloat16* src, size_t dst_stride, size_t src_stride) { static const uint64_t idx[8] = {0, 4, 1, 5, 2, 6, 3, 7}; auto midx = _mm512_loadu_si512(idx); for (size_t i = 0; i < 16; i++) { auto x = _mm256_loadu_si256(reinterpret_cast<__m256i*>(src)); // [a1 a2 a3 a4] total 256-bits in 4 64bits unit - auto y = 
_mm256_loadu_si256(reinterpret_cast<__m256i*>(src + stride)); // [b1 b2 b3 b4] total 256-bits + auto y = _mm256_loadu_si256(reinterpret_cast<__m256i*>(src + src_stride)); // [b1 b2 b3 b4] total 256-bits auto a = _mm512_castsi256_si512(x); auto b = _mm512_castsi256_si512(y); a = _mm512_permutexvar_epi64(midx, a); // [a1 x | a2 x | a3 x | a4 x] b = _mm512_permutexvar_epi64(midx, b); // [b1 x | b2 x | b3 x | b4 x] auto B0 = _mm512_unpacklo_epi16(a, b); _mm512_storeu_si512(dst, B0); - src += 2 * stride; - dst += 2 * stride; + src += 2 * src_stride; + dst += 2 * dst_stride; } } -static void pack_32Nx16K(ov::bfloat16* dst, ov::bfloat16* src, ov::bfloat16* tmp, size_t N, size_t K, size_t stride) { +static void pack_32Nx16K(ov::bfloat16* dst, ov::bfloat16* src, ov::bfloat16* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { for (size_t n = 0; n < N; n += 32) { size_t k = 0; for (; k + 32 <= K; k += 32) { - pack_32x32_kernel(dst + k * 2, src + k, stride); + pack_32x32_kernel(dst + k * 2, src + k, dst_stride, src_stride); } if (k < K) - pack_32x16_kernel(dst + k * 2, src + k, stride); + pack_32x16_kernel(dst + k * 2, src + k, dst_stride, src_stride); - dst += 32 * stride; - src += 32 * stride; + dst += 32 * dst_stride; + src += 32 * src_stride; } } -static void pack_32Nx16K(ov::bfloat16* dst, uint8_t* src, ov::bfloat16* tmp, size_t N, size_t K, size_t stride) { +static void pack_32Nx16K(ov::bfloat16* dst, uint8_t* src, ov::bfloat16* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // The layout for per token per head: // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| // The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) @@ -721,15 +727,15 @@ static void pack_32Nx16K(ov::bfloat16* dst, uint8_t* src, ov::bfloat16* tmp, siz for (size_t n = 0; n < N; n ++) { auto f = reinterpret_cast(s); attn_dequant_u8_kernel(s + 2 * sizeof(float), t, K, f[0], f[1]); - s += stride + 2 * sizeof(float); - t += stride; + s += src_stride + 2 * sizeof(float); + t += src_stride; } - pack_32Nx16K(dst, tmp, reinterpret_cast(0), N, K, stride); + pack_32Nx16K(dst, tmp, reinterpret_cast(0), N, K, dst_stride, src_stride); } #endif template -static void pack_32Nx16K(float* dst, T* src, float* tmp, size_t N, size_t K, size_t stride) { +static void pack_32Nx16K(float* dst, T* src, float* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // never called OPENVINO_THROW("pack_32Nx16K: should not be called."); } @@ -858,7 +864,7 @@ struct MHAHelper { void init_reorder_buffers(size_t batch, size_t kv_len_in_blocks) { _qk_scratch_b.resize({batch, kv_len_in_blocks, _Hk, _block_size * _S}); - _wv_scratch_b.resize({batch, kv_len_in_blocks, _Hk, _block_size * _S}); + _wv_scratch_b.resize({batch, kv_len_in_blocks, _Hk, _block_size * rnd_up(_S, _block_size)}); } // compute one block(such as 32 tokens) of query in M dimension: softmax(q_block*k')*v @@ -1307,7 +1313,7 @@ struct MHA { auto ithr = parallel_get_thread_num(); auto* k_ptr = k_cache.ptr(block_number, hk); auto* v_ptr = v_cache.ptr(block_number, hk); - transpose_16Nx16K(_helper._qk_scratch_b.template ptr(batch_in_reorder, kv_block, hk), + transpose_16NxK(_helper._qk_scratch_b.template ptr(batch_in_reorder, kv_block, hk), k_ptr, _helper._output.template ptr(ithr), _helper._block_size, @@ -1318,6 +1324,7 @@ struct MHA { _helper._output.template ptr(ithr), _helper._block_size, _helper._S, + rnd_up(_helper._S, _helper._block_size), _helper._S); } else { // 
need to decompress diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp index b39028792ee547..b719246e4976a1 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp @@ -133,6 +133,50 @@ inline void transpose_16x16_kernel(float* _dst, T* src, size_t dst_stride, size_ _mm512_storeu_si512(dst + 15 * dst_stride, rf); } +template +inline void transpose_16xK_kernel(float* _dst, T* src, size_t K, size_t dst_stride, size_t src_stride) { + auto* dst = reinterpret_cast(_dst); + __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; + r0 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src, K)); + r1 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + src_stride, K)); + r2 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 2 * src_stride, K)); + r3 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 3 * src_stride, K)); + r4 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 4 * src_stride, K)); + r5 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 5 * src_stride, K)); + r6 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 6 * src_stride, K)); + r7 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 7 * src_stride, K)); + r8 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 8 * src_stride, K)); + r9 = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 9 * src_stride, K)); + ra = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 10 * src_stride, K)); + rb = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 11 * src_stride, K)); + rc = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 12 * src_stride, K)); + rd = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 13 * src_stride, K)); + re = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 14 * src_stride, K)); + rf = _mm512_castps_si512(mm512_uni_loadu_tail_ps(src + 15 * src_stride, K)); + + transpose_m512i_16x16(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf); + +#define S(m) _mm512_storeu_si512(dst + 0x##m * dst_stride, r##m) +#define S8() S(0); S(1); S(2); S(3); S(4); S(5); S(6); S(7); + switch (K) { + case 8: S8(); break; + case 9: S8() S(8); break; + case 10: S8(); S(8); S(9); break; + case 11: S8(); S(8); S(9); S(a); break; + case 12: S8(); S(8); S(9); S(a); S(b); break; + case 13: S8(); S(8); S(9); S(a); S(b); S(c); break; + case 14: S8(); S(8); S(9); S(a); S(b); S(c); S(d); break; + case 15: S8(); S(8); S(9); S(a); S(b); S(c); S(d); S(e); break; + case 1: S(0); break; + case 2: S(0); S(1); break; + case 3: S(0); S(1); S(2); break; + case 4: S(0); S(1); S(2); S(3); break; + case 5: S(0); S(1); S(2); S(3); S(4); break; + case 6: S(0); S(1); S(2); S(3); S(4); S(5); break; + case 7: S(0); S(1); S(2); S(3); S(4); S(5); S(6); break; + } +} + inline void transpose_16x16_kernel(uint32_t* dst, uint32_t* src, size_t dst_stride, size_t src_stride) { __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; r0 = _mm512_loadu_si512(src); @@ -172,6 +216,50 @@ inline void transpose_16x16_kernel(uint32_t* dst, uint32_t* src, size_t dst_stri _mm512_storeu_si512(dst + 15 * dst_stride, rf); } +inline void transpose_16xK_kernel(uint32_t* dst, uint32_t* src, size_t K, size_t dst_stride, size_t src_stride) { + __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; + __mmask16 k = 0xffff >> (16 - K); + + r0 = _mm512_maskz_loadu_epi32(k, src); + r1 = _mm512_maskz_loadu_epi32(k, src + 
src_stride); + r2 = _mm512_maskz_loadu_epi32(k, src + 2 * src_stride); + r3 = _mm512_maskz_loadu_epi32(k, src + 3 * src_stride); + r4 = _mm512_maskz_loadu_epi32(k, src + 4 * src_stride); + r5 = _mm512_maskz_loadu_epi32(k, src + 5 * src_stride); + r6 = _mm512_maskz_loadu_epi32(k, src + 6 * src_stride); + r7 = _mm512_maskz_loadu_epi32(k, src + 7 * src_stride); + r8 = _mm512_maskz_loadu_epi32(k, src + 8 * src_stride); + r9 = _mm512_maskz_loadu_epi32(k, src + 9 * src_stride); + ra = _mm512_maskz_loadu_epi32(k, src + 10 * src_stride); + rb = _mm512_maskz_loadu_epi32(k, src + 11 * src_stride); + rc = _mm512_maskz_loadu_epi32(k, src + 12 * src_stride); + rd = _mm512_maskz_loadu_epi32(k, src + 13 * src_stride); + re = _mm512_maskz_loadu_epi32(k, src + 14 * src_stride); + rf = _mm512_maskz_loadu_epi32(k, src + 15 * src_stride); + + transpose_m512i_16x16(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf); + + switch (K) { + case 8: S8(); break; + case 9: S8() S(8); break; + case 10: S8(); S(8); S(9); break; + case 11: S8(); S(8); S(9); S(a); break; + case 12: S8(); S(8); S(9); S(a); S(b); break; + case 13: S8(); S(8); S(9); S(a); S(b); S(c); break; + case 14: S8(); S(8); S(9); S(a); S(b); S(c); S(d); break; + case 15: S8(); S(8); S(9); S(a); S(b); S(c); S(d); S(e); break; + case 1: S(0); break; + case 2: S(0); S(1); break; + case 3: S(0); S(1); S(2); break; + case 4: S(0); S(1); S(2); S(3); break; + case 5: S(0); S(1); S(2); S(3); S(4); break; + case 6: S(0); S(1); S(2); S(3); S(4); S(5); break; + case 7: S(0); S(1); S(2); S(3); S(4); S(5); S(6); break; + } +#undef S +#undef S8 +} + #elif defined(HAVE_AVX2) // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2 @@ -235,6 +323,64 @@ inline void transpose_16x16_kernel(float* dst, T* src, size_t dst_stride, size_t } } +template +inline void transpose_16xK_kernel(float* dst, T* src, size_t K, size_t dst_stride, size_t src_stride) { + __m256 r0, r1, r2, r3, r4, r5, r6, r7; + + if (K >= 8) { + for (int j = 0; j < 16; j += 8) { + r0 = mm256_uni_loadu_ps(src + src_stride * j); + r1 = mm256_uni_loadu_ps(src + src_stride * (1 + j)); + r2 = mm256_uni_loadu_ps(src + src_stride * (2 + j)); + r3 = mm256_uni_loadu_ps(src + src_stride * (3 + j)); + r4 = mm256_uni_loadu_ps(src + src_stride * (4 + j)); + r5 = mm256_uni_loadu_ps(src + src_stride * (5 + j)); + r6 = mm256_uni_loadu_ps(src + src_stride * (6 + j)); + r7 = mm256_uni_loadu_ps(src + src_stride * (7 + j)); + + transpose_8x8(r0, r1, r2, r3, r4, r5, r6, r7); + + _mm256_storeu_ps(dst + j, r0); + _mm256_storeu_ps(dst + j + dst_stride, r1); + _mm256_storeu_ps(dst + j + dst_stride * 2, r2); + _mm256_storeu_ps(dst + j + dst_stride * 3, r3); + _mm256_storeu_ps(dst + j + dst_stride * 4, r4); + _mm256_storeu_ps(dst + j + dst_stride * 5, r5); + _mm256_storeu_ps(dst + j + dst_stride * 6, r6); + _mm256_storeu_ps(dst + j + dst_stride * 7, r7); + } + src += 8; + dst += 8 * dst_stride; + K -= 8; + } + if (K > 0) { + for (int j = 0; j < 16; j += 8) { + r0 = mm256_uni_loadu_tail_ps(src + src_stride * j, K); + r1 = mm256_uni_loadu_tail_ps(src + src_stride * (1 + j), K); + r2 = mm256_uni_loadu_tail_ps(src + src_stride * (2 + j), K); + r3 = mm256_uni_loadu_tail_ps(src + src_stride * (3 + j), K); + r4 = mm256_uni_loadu_tail_ps(src + src_stride * (4 + j), K); + r5 = mm256_uni_loadu_tail_ps(src + src_stride * (5 + j), K); + r6 = mm256_uni_loadu_tail_ps(src + src_stride * (6 + j), K); + r7 = mm256_uni_loadu_tail_ps(src + src_stride * (7 + j), K); + + transpose_8x8(r0, r1, r2, r3, r4, r5, 
r6, r7); + +#define S(m) _mm256_storeu_ps(dst + j + m * dst_stride, r##m) + switch (K) { + case 1: S(0); break; + case 2: S(0); S(1); break; + case 3: S(0); S(1); S(2); break; + case 4: S(0); S(1); S(2); S(3); break; + case 5: S(0); S(1); S(2); S(3); S(4); break; + case 6: S(0); S(1); S(2); S(3); S(4); S(5); break; + case 7: S(0); S(1); S(2); S(3); S(4); S(5); S(6); break; + } +#undef S + } + } +} + #else template @@ -246,6 +392,15 @@ inline void transpose_16x16_kernel(TDST* dst, TSRC* src, size_t dst_stride, size } } +template +inline void transpose_16xK_kernel(TDST* dst, TSRC* src, size_t K, size_t dst_stride, size_t src_stride) { + for (size_t i = 0; i < K; i++) { + for (size_t j = 0; j < 16; j++) { + dst[i * dst_stride + j] = static_cast(src[i + j * src_stride]); + } + } +} + #endif } // namespace XARCH diff --git a/src/plugins/intel_cpu/src/nodes/roi_align_rotated.cpp b/src/plugins/intel_cpu/src/nodes/roi_align_rotated.cpp index 2ce6f78e234389..e75bef1213d2cf 100644 --- a/src/plugins/intel_cpu/src/nodes/roi_align_rotated.cpp +++ b/src/plugins/intel_cpu/src/nodes/roi_align_rotated.cpp @@ -4,7 +4,7 @@ #include "roi_align_rotated.h" -#include +#include #include "common/cpu_convert.h" #include "openvino/reference/roi_align.hpp" @@ -15,7 +15,7 @@ namespace node { ROIAlignRotated::ROIAlignRotated(const std::shared_ptr& op, const GraphContext::CPtr context) : Node(op, context, NgraphShapeInferFactory(op, EMPTY_PORT_MASK)) { - const auto roiAlign = ov::as_type_ptr(op); + const auto roiAlign = ov::as_type_ptr(op); pooledH = roiAlign->get_pooled_h(); pooledW = roiAlign->get_pooled_w(); spatialScale = roiAlign->get_spatial_scale(); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index c230d468cd7922..85e8c2e10615b7 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -303,8 +303,6 @@ std::vector disabledTestPatterns() { R"(.*(nightly|smoke)_MM_Brgemm_Static/MatMulLayerCPUTest.CompareWithRefs/MatMul_IS=\[\]_\[\]_TS=\(\(55.12\)\)_\(\(12.55\)\)_.*config=\(INFERENCE_PRECISION_HINT=bf16_\)_Fused=Multiply\(PerChannel\)_primitive=brgemm_avx512.*)", R"(.*smoke_MM_Brgemm_Dynamic_Fusing/MatMulLayerCPUTest.CompareWithRefs/MatMul_IS=\[\?.\?\]_\[\?.33\]_TS=\(\(16.12\)_\(33.7\)_\(16.12\)\)_\(\(12.33\)_\(7.33\)_\(12.33\)\)_transpose_a=0_transpose_b=0_secondaryInputType=PARAMETER_netPRC=f32_inPRC=undefined_outPRC=undefined_trgDev=CPUconfig=\(INFERENCE_PRECISION_HINT=bf16_\)_Fused=Multiply\(PerChannel\)_primitive=brgemm_avx512.*)", // Issue: 140389 - R"(.*smoke_GatherCompressedWeights_basic/GatherWeightsDecompression.CompareWithRefs.*i4.*)", - R"(.*smoke_MatmulAndGatherSharedWeightsDecompression.*weights_precision=i4.*decompression_subtract=1.*)", R"(.*FQLayerDQBias.smoke_CompareWithRefs.*)", R"(.*smoke_matmulBrgemmInt8/MatmulBrgemmInt8Test.CompareWithRefs.*MatMul.*InputType=i8_OutputType=i8.*)", R"(.*smoke_Snippets_MHAWOTransposeOnInputs_4D/MHAWOTransposeOnInputs.CompareWithRefImpl.*)", diff --git a/src/plugins/intel_cpu/tools/dump_check/dump_check.py b/src/plugins/intel_cpu/tools/dump_check/dump_check.py index f2426e3e1333f6..0ac1cd41da864e 100644 --- a/src/plugins/intel_cpu/tools/dump_check/dump_check.py +++ b/src/plugins/intel_cpu/tools/dump_check/dump_check.py @@ -75,14 +75,14 @@ def fill_tensors_from_image(input, input_file): class IEB: 
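+    # Maps the numeric precision code stored in an IEB dump header to a
+    # (numpy dtype, element size in bytes) pair; the codes below are assumed to match
+    # the element-type ids that the current CPU plugin dump writer emits.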
precision_table = { - 10:(np.float32, 4), - 12:(np.int16, 2), - 40:(np.uint8, 1), - 50:(np.int8, 1), - 70:(np.int32, 4), - 74:(np.uint32, 4), - 72:(np.int64, 8), - 73:(np.uint64, 8) + 5:(np.float32, 4), + 9:(np.int16, 2), + 14:(np.uint8, 1), + 8:(np.int8, 1), + 10:(np.int32, 4), + 15:(np.uint32, 4), + 11:(np.int64, 8), + 17:(np.uint64, 8) } @classmethod diff --git a/src/plugins/intel_gpu/CMakeLists.txt b/src/plugins/intel_gpu/CMakeLists.txt index 2f3d9127dde7e0..dc24e404c74b81 100644 --- a/src/plugins/intel_gpu/CMakeLists.txt +++ b/src/plugins/intel_gpu/CMakeLists.txt @@ -76,8 +76,8 @@ set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_REL # Workaround to avoid warnings during LTO build if(CMAKE_COMPILER_IS_GNUCXX) - set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS_RELEASE "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized" - LINK_FLAGS_RELWITHDEBINFO "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized") + set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS_RELEASE "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized -Wno-stringop-overflow" + LINK_FLAGS_RELWITHDEBINFO "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized -Wno-stringop-overflow") endif() if(ENABLE_TESTS) diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp new file mode 100644 index 00000000000000..45416b4e53810b --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp @@ -0,0 +1,94 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/node.hpp" +#include "openvino/core/partial_shape.hpp" +#include "openvino/op/op.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" + +namespace ov { +namespace intel_gpu { +namespace op { + +class SDPA : public ov::op::v13::ScaledDotProductAttention { +public: + OPENVINO_OP("SDPA", "gpu_opset"); + + SDPA() = default; + + SDPA(const ov::Output& Q, + const ov::Output& K, + const ov::Output& V, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const bool is_causal, + const ov::element::Type output_type = ov::element::undefined); + + SDPA(const ov::Output& Q, + const ov::Output& K, + const ov::Output& V, + const ov::Output& attn_mask, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const bool is_causal, + const ov::element::Type output_type = ov::element::undefined); + + SDPA(const ov::Output& Q, + const ov::Output& K, + const ov::Output& V, + const ov::Output& attn_mask, + const ov::Output& scale, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const bool is_causal, + const ov::element::Type output_type = ov::element::undefined); + + bool visit_attributes(ov::AttributeVisitor &visitor) override; + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + bool get_causal() const { return m_is_causal; } + + std::vector get_input0_transpose_order() const { return m_order_q; } + std::vector get_input1_transpose_order() const { return m_order_k; } + std::vector get_input2_transpose_order() const { return m_order_v; } + std::vector get_output_transpose_order() const { return m_order_out; } + ov::element::Type get_output_type() const { return m_output_type; } + + static 
std::vector default_order(size_t rank) { + std::vector order(rank); + std::iota(order.begin(), order.end(), 0); + return order; + } + +protected: + std::vector m_order_q; + std::vector m_order_k; + std::vector m_order_v; + std::vector m_order_out; + bool m_is_causal; + ov::element::Type m_output_type; +}; + +std::vector shape_infer(const SDPA* op, + std::vector input_shapes, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out); + + +} // namespace op +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp index 68cb607b116f24..7979870275d240 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp @@ -263,9 +263,10 @@ REGISTER_FACTORY(v12, ScatterElementsUpdate); // ------------------------------ Supported v13 ops ----------------------------- // REGISTER_FACTORY(v13, Multinomial); +REGISTER_FACTORY(v13, ScaledDotProductAttention); // ------------------------------ Supported v14 ops ----------------------------- // -REGISTER_FACTORY(v14, ROIAlignRotated); +REGISTER_FACTORY(v15, ROIAlignRotated); // --------------------------- Supported internal ops --------------------------- // REGISTER_FACTORY(internal, NonMaxSuppressionIEInternal); @@ -283,3 +284,4 @@ REGISTER_FACTORY(internal, SwiGLU); REGISTER_FACTORY(internal, IndirectGemm); REGISTER_FACTORY(internal, Convolution); REGISTER_FACTORY(internal, Placeholder); +REGISTER_FACTORY(internal, SDPA); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp new file mode 100644 index 00000000000000..f4f32a6af37d87 --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp @@ -0,0 +1,95 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "primitive.hpp" + +namespace cldnn { + +struct scaled_dot_product_attention : public primitive_base { + CLDNN_DECLARE_PRIMITIVE(scaled_dot_product_attention) + + scaled_dot_product_attention() : primitive_base("", {}) {} + + /// @brief Constructs scaled_dot_product_attention primitive. + /// @param id This primitive id. + /// @param inputs Input data primitives id (query, keys, values, [attention_mask], [scale]). + /// @param is_causal If true, assumes causal attention masking. In this case attention_mask input is ignored. 
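
A hedged usage sketch (not in this patch) of how the primitive below might be instantiated; the input ids are hypothetical and the element type of the inputs vector is assumed to be the usual cldnn::input_info. Passing four inputs makes has_attn_mask_input true while has_scale_input stays false, since both flags are derived purely from the number of inputs.

// Hypothetical construction of the SDPA primitive for tensors that arrive as
// [batch, seq_len, heads, head_size] but must be processed as
// [batch, heads, seq_len, head_size].
cldnn::scaled_dot_product_attention sdpa_prim(
    "sdpa",                                                   // primitive id (illustrative)
    {cldnn::input_info("query"), cldnn::input_info("key"),
     cldnn::input_info("value"), cldnn::input_info("attn_mask")},
    /*is_causal=*/false,
    /*input_q_transpose_order=*/{0, 2, 1, 3},
    /*input_k_transpose_order=*/{0, 2, 1, 3},
    /*input_v_transpose_order=*/{0, 2, 1, 3},
    /*output_transpose_order=*/{0, 1, 2, 3});
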
+ scaled_dot_product_attention(const primitive_id& id, + const std::vector inputs, + bool is_causal, + const std::vector& input_q_transpose_order = {}, + const std::vector& input_k_transpose_order = {}, + const std::vector& input_v_transpose_order = {}, + const std::vector& output_transpose_order = {}, + const padding& output_padding = padding()) + : primitive_base(id, inputs, {output_padding}) + , is_causal(is_causal) + , has_attn_mask_input(inputs.size() > 3) + , has_scale_input(inputs.size() > 4) + , input_q_transpose_order(input_q_transpose_order) + , input_k_transpose_order(input_k_transpose_order) + , input_v_transpose_order(input_v_transpose_order) + , output_transpose_order(output_transpose_order) {} + + + bool is_causal = false; + bool has_attn_mask_input = false; + bool has_scale_input = false; + + std::vector input_q_transpose_order; + std::vector input_k_transpose_order; + std::vector input_v_transpose_order; + std::vector output_transpose_order; + + size_t hash() const override { + size_t seed = primitive::hash(); + seed = hash_combine(seed, is_causal); + seed = hash_combine(seed, has_attn_mask_input); + seed = hash_combine(seed, has_scale_input); + seed = hash_range(seed, input_q_transpose_order.begin(), input_q_transpose_order.end()); + seed = hash_range(seed, input_k_transpose_order.begin(), input_k_transpose_order.end()); + seed = hash_range(seed, input_v_transpose_order.begin(), input_v_transpose_order.end()); + seed = hash_range(seed, output_transpose_order.begin(), output_transpose_order.end()); + return seed; + } + + bool operator==(const primitive& rhs) const override { + if (!compare_common_params(rhs)) + return false; + + auto rhs_casted = downcast(rhs); + + return is_causal == rhs_casted.is_causal && + has_attn_mask_input == rhs_casted.has_attn_mask_input && + has_scale_input == rhs_casted.has_scale_input && + input_q_transpose_order == rhs_casted.input_q_transpose_order && + input_k_transpose_order == rhs_casted.input_k_transpose_order && + input_v_transpose_order == rhs_casted.input_v_transpose_order && + output_transpose_order == rhs_casted.output_transpose_order; + } + + void save(BinaryOutputBuffer& ob) const override { + primitive_base::save(ob); + ob << is_causal; + ob << has_attn_mask_input; + ob << has_scale_input; + ob << input_q_transpose_order; + ob << input_k_transpose_order; + ob << input_v_transpose_order; + ob << output_transpose_order; + } + + void load(BinaryInputBuffer& ib) override { + primitive_base::load(ib); + ib >> is_causal; + ib >> has_attn_mask_input; + ib >> has_scale_input; + ib >> input_q_transpose_order; + ib >> input_k_transpose_order; + ib >> input_v_transpose_order; + ib >> output_transpose_order; + } +}; +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/gemm.cpp b/src/plugins/intel_gpu/src/graph/gemm.cpp index 4af921d566bffc..a8b196bd45885f 100644 --- a/src/plugins/intel_gpu/src/graph/gemm.cpp +++ b/src/plugins/intel_gpu/src/graph/gemm.cpp @@ -272,6 +272,9 @@ std::string gemm_inst::to_string(gemm_node const& node) { gemm_info.add("transpose_input1", transpose_input1); gemm_info.add("indirect_input0", indirect_input0); gemm_info.add("indirect_input1", indirect_input1); + gemm_info.add("trasnpose_order_input0", desc->input0_transpose_order); + gemm_info.add("trasnpose_order_input1", desc->input1_transpose_order); + gemm_info.add("trasnpose_order_output", desc->output_transpose_order); node_info->add("gemm info", gemm_info); node_info->dump(primitive_description); diff --git 
a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index 8bab7e44dca4fa..1a235f1293f382 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -681,14 +681,6 @@ void prepare_buffer_fusing::run(program& p) { if (gather_prim) { update_dep(gather_prim); } - - // Fallback to ocl impl since oneDNN doesn't support dynamic paddings - for (auto user : node.get_users()) { - if (user->get_preferred_impl_type() == impl_types::onednn) { - GPU_DEBUG_TRACE_DETAIL << user->id() << ": change impl to ocl because of dynamic input paddings\n"; - user->set_preferred_impl_type(impl_types::ocl); - } - } } }); program_helpers::do_for_types(*node, [](read_value_node& node) { diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp index 40264d856035e2..855ae9c421b235 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp @@ -93,6 +93,7 @@ void register_implementations() { REGISTER_OCL(eye); REGISTER_OCL(unique_count); REGISTER_OCL(unique_gather); + REGISTER_OCL(scaled_dot_product_attention); } } // namespace ocl diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp index a2f3202f816671..f0d2a72e51d848 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp @@ -74,6 +74,7 @@ #include "intel_gpu/primitives/eye.hpp" #include "intel_gpu/primitives/unique.hpp" #include "intel_gpu/primitives/kv_cache.hpp" +#include "intel_gpu/primitives/scaled_dot_product_attention.hpp" namespace cldnn { namespace ocl { @@ -172,6 +173,7 @@ REGISTER_OCL(gather_nonzero); REGISTER_OCL(eye); REGISTER_OCL(unique_count); REGISTER_OCL(unique_gather); +REGISTER_OCL(scaled_dot_product_attention); #undef REGISTER_OCL diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp new file mode 100644 index 00000000000000..d60098aca74588 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp @@ -0,0 +1,138 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "primitive_base.hpp" +#include "scaled_dot_product_attention_inst.h" +#include "sdpa/sdpa_kernel_selector.h" +#include "sdpa/sdpa_kernel_base.h" + +namespace cldnn { +namespace ocl { +struct scaled_dot_product_attention_impl : typed_primitive_impl_ocl { + using parent = typed_primitive_impl_ocl; + using parent::parent; + using kernel_selector_t = kernel_selector::sdpa_kernel_selector; + using kernel_params_t = kernel_selector::sdpa_params; + + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::scaled_dot_product_attention_impl) + + std::unique_ptr clone() const override { + return make_unique(*this); + } + + void load(BinaryInputBuffer& ib) override { + parent::load(ib); + if (is_dynamic()) { + auto& kernel_selector = kernel_selector_t::Instance(); + auto kernel_impl = kernel_selector.GetImplementation(_kernel_data.kernelName); + kernel_impl->GetUpdateDispatchDataFunc(_kernel_data); + } + } + + static kernel_selector::sdpa_configuration get_sdpa_configuration(const kernel_impl_params& impl_param) { + 
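
The configuration helper that follows first views each input's partial shape through the primitive's transpose order. A minimal sketch of that permutation on plain dimension vectors (illustrative, not part of the patch; an empty order is treated as identity, matching the lambda below):

#include <cstddef>
#include <cstdint>
#include <vector>

// order[i] names which source axis lands at output position i.
std::vector<int64_t> permute_dims(const std::vector<int64_t>& dims,
                                  const std::vector<int64_t>& order) {
    if (order.empty())
        return dims;                       // empty order means "keep as is"
    std::vector<int64_t> out(order.size());
    for (std::size_t i = 0; i < order.size(); ++i)
        out[i] = dims[order[i]];
    return out;
}

// e.g. permute_dims({1, 128, 32, 64}, {0, 2, 1, 3}) == {1, 32, 128, 64}:
// a [batch, seq, heads, head_size] layout viewed as [batch, heads, seq, head_size].
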
kernel_selector::sdpa_configuration config; + + auto transpose_pshape = [](const ov::PartialShape& pshape, const std::vector& order) { + if (order.empty()) + return pshape; + + auto transposed_pshape = ov::PartialShape::dynamic(pshape.rank()); + for (size_t i = 0; i < order.size(); i++) { + transposed_pshape[i] = pshape[order[i]]; + } + return transposed_pshape; + }; + + const auto& prim = impl_param.typed_desc(); + const auto query_shape = transpose_pshape(impl_param.get_input_layout(0).get_partial_shape(), prim->input_q_transpose_order); + const auto key_shape = transpose_pshape(impl_param.get_input_layout(1).get_partial_shape(), prim->input_k_transpose_order); + const auto value_shape = transpose_pshape(impl_param.get_input_layout(2).get_partial_shape(), prim->input_v_transpose_order); + + OPENVINO_ASSERT(key_shape == value_shape, "[GPU] The shapes of key and value inputs are expected to be equal"); + for (size_t i = 0; i < query_shape.size(); ++i) { + if (query_shape[i].is_static() && key_shape[i].is_static() && value_shape[i].is_static()) { + if (query_shape[i].get_length() > key_shape[i].get_length()) { + config.broadcast_axis = prim->input_k_transpose_order[i]; + config.group_size = query_shape[i].get_length() / key_shape[i].get_length(); + } + } + } + + if (query_shape[query_shape.size() - 1].is_static()) + config.head_size = query_shape[query_shape.size() - 1].get_length(); + + config.is_causal = prim->is_causal; + + return config; + } + + static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_dynamic) { + auto params = get_default_params(impl_param, is_dynamic); + + const auto inputs_num = impl_param.input_layouts.size(); + params.inputs.resize(inputs_num); + for (size_t i = 0; i < inputs_num; i++) { + params.inputs[i] = convert_data_tensor(impl_param.get_input_layout(i)); + } + + params.conf = get_sdpa_configuration(impl_param); + + const auto& prim = impl_param.typed_desc(); + params.input0_order = prim->input_q_transpose_order; + params.input1_order = prim->input_k_transpose_order; + params.input2_order = prim->input_v_transpose_order; + params.output_order = prim->output_transpose_order; + + params.set_dynamic_shape_offsets(); + + return params; + } + + static std::unique_ptr create(const typed_program_node& arg, const kernel_impl_params& impl_param) { + auto sdpa_kernel_params = get_kernel_params(impl_param, impl_param.is_dynamic()); + auto& sdpa_kernel_selector = kernel_selector_t::Instance(); + auto kd = sdpa_kernel_selector.get_best_kernel(sdpa_kernel_params); + + return cldnn::make_unique(kd); + } + + void update_dispatch_data(const kernel_impl_params& impl_param) override { + auto kernel_params = get_kernel_params(impl_param, true); + (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + } +}; + +namespace detail { + +attach_scaled_dot_product_attention_impl::attach_scaled_dot_product_attention_impl() { + using sdpa_prim = scaled_dot_product_attention; + + auto types = { + data_types::f32, + data_types::f16, + }; + + auto formats = { + format::bfyx, + }; + + implementation_map::add(impl_types::ocl, + shape_types::static_shape, + scaled_dot_product_attention_impl::create, + types, + formats); + + implementation_map::add(impl_types::ocl, + shape_types::dynamic_shape, + scaled_dot_product_attention_impl::create, + types, + formats); +} + +} // namespace detail +} // namespace ocl +} // namespace cldnn + +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::scaled_dot_product_attention_impl) 
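
A sketch of the grouped-query (KV-head broadcast) detection performed in get_sdpa_configuration above, written for static dimensions only; names are illustrative. When the transposed query has, say, 32 heads against 8 key/value heads, each KV head is reused for group_size = 4 query heads, and the axis to broadcast is reported in the key input's original (pre-transpose) indexing.

#include <cstddef>
#include <cstdint>
#include <vector>

struct sdpa_broadcast_info {
    int64_t axis = -1;      // axis of K/V to broadcast, in the key input's original order
    int64_t group_size = 1; // how many query heads share one KV head
};

sdpa_broadcast_info detect_kv_broadcast(const std::vector<int64_t>& q_dims,   // already transposed
                                        const std::vector<int64_t>& kv_dims,  // already transposed
                                        const std::vector<int64_t>& order_k) {
    sdpa_broadcast_info info;
    for (std::size_t i = 0; i < q_dims.size(); ++i) {
        if (q_dims[i] > kv_dims[i]) {                    // e.g. 32 query heads vs 8 KV heads
            info.axis = order_k.empty() ? int64_t(i) : order_k[i];
            info.group_size = q_dims[i] / kv_dims[i];    // 32 / 8 = 4
        }
    }
    return info;
}
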
+BIND_BINARY_BUFFER_WITH_TYPE(cldnn::scaled_dot_product_attention) diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp index cdc78316b03d47..10c1a970d1793b 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp @@ -64,6 +64,8 @@ struct gemm_onednn : typed_primitive_onednn_impl { dnnl::memory::data_type& out_dt, dnnl::memory::dims& in0_dims, dnnl::memory::dims& in1_dims, + dnnl::memory::dims& in0_strides, + dnnl::memory::dims& in1_strides, dnnl::memory::dims& out_dims, dnnl::memory::format_tag& in0_fmt, dnnl::memory::format_tag& in1_fmt, @@ -111,6 +113,22 @@ struct gemm_onednn : typed_primitive_onednn_impl { in1_fmt = onednn::convert_gemm_data_format(in1_dims, in1_l.format); out_fmt = onednn::convert_gemm_data_format(out_dims, out_l.format); + if (in0_l.data_padding) { + dnnl::memory::dims in0_padded_dims = onednn::convert_gemm_tensor(in0_l.get_buffer_size(), rank, batched_dims_can_be_removed); + if (prim->transpose_input0) { + std::swap(in0_padded_dims[in0_padded_dims.size() - 1], in0_padded_dims[in0_padded_dims.size() - 2]); + } + in0_strides = onednn::get_strides(in0_padded_dims); + } + + if (in1_l.data_padding) { + dnnl::memory::dims in1_padded_dims = onednn::convert_gemm_tensor(in1_l.get_buffer_size(), rank, batched_dims_can_be_removed); + if (prim->transpose_input1) { + std::swap(in1_padded_dims[in1_padded_dims.size() - 1], in1_padded_dims[in1_padded_dims.size() - 2]); + } + in1_strides = onednn::get_strides(in1_padded_dims); + } + if (prim->transpose_input0) { in0_fmt = transpose_format(in0_fmt); std::swap(in0_dims[in0_dims.size() - 1], in0_dims[in0_dims.size() - 2]); @@ -130,6 +148,19 @@ struct gemm_onednn : typed_primitive_onednn_impl { } } + static dnnl::memory::desc get_input_memory_desc(const dnnl::memory::dims& dims, + dnnl::memory::data_type dt, + dnnl::memory::format_tag fmt, + const dnnl::memory::dims& strides) { + dnnl::memory::desc res; + if (strides.empty()) { + res = dnnl::memory::desc(dims, dt, fmt); + } else { + res = dnnl::memory::desc(dims, dt, strides); + } + return res; + } + static std::shared_ptr get_gemm_primitive_descriptor(const kernel_impl_params& impl_params, const dnnl::primitive_attr& attr = dnnl::primitive_attr()) { auto& engine = impl_params.prog->get_engine(); @@ -146,16 +177,19 @@ struct gemm_onednn : typed_primitive_onednn_impl { dnnl::memory::dims out_dims; dnnl::memory::dims bias_dims; + dnnl::memory::dims in0_strides; + dnnl::memory::dims in1_strides; + dnnl::memory::format_tag in0_fmt; dnnl::memory::format_tag in1_fmt; dnnl::memory::format_tag out_fmt; dnnl::memory::format_tag bias_fmt; - get_gemm_primitive_md(impl_params, in0_dt, in1_dt, out_dt, in0_dims, in1_dims, out_dims, in0_fmt, in1_fmt, out_fmt, - gemm_with_bias, bias_dt, bias_dims, bias_fmt); + get_gemm_primitive_md(impl_params, in0_dt, in1_dt, out_dt, in0_dims, in1_dims, in0_strides, in1_strides, + out_dims, in0_fmt, in1_fmt, out_fmt, gemm_with_bias, bias_dt, bias_dims, bias_fmt); - dnnl::memory::desc in0_md(in0_dims, in0_dt, in0_fmt); - dnnl::memory::desc in1_md(in1_dims, in1_dt, in1_fmt); + dnnl::memory::desc in0_md = get_input_memory_desc(in0_dims, in0_dt, in0_fmt, in0_strides); + dnnl::memory::desc in1_md = get_input_memory_desc(in1_dims, in1_dt, in1_fmt, in1_strides); dnnl::memory::desc out_md(out_dims, out_dt, out_fmt); if (gemm_with_bias) { @@ -199,13 +233,16 @@ struct gemm_onednn : typed_primitive_onednn_impl { 
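
The hunk above switches padded GEMM inputs from plain format-tag descriptors to explicit strides computed from the padded buffer dimensions (with the last two dims swapped first when the input is transposed). A standalone sketch of the stride computation used by onednn::get_strides, which this diff adds further down (illustrative only):

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Dense row-major strides for a set of (possibly padded) dimensions: the
// innermost axis has stride 1, each outer axis the product of all inner dims.
std::vector<int64_t> row_major_strides(const std::vector<int64_t>& dims) {
    std::vector<int64_t> strides(dims.size(), 1);
    std::partial_sum(dims.rbegin(), dims.rend() - 1, strides.rbegin() + 1,
                     std::multiplies<int64_t>());
    return strides;
}

// row_major_strides({2, 8, 16, 32}) == {4096, 512, 32, 1}
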
dnnl::memory::dims out_dims; dnnl::memory::dims bias_dims; + dnnl::memory::dims in0_strides; + dnnl::memory::dims in1_strides; + dnnl::memory::format_tag in0_fmt; dnnl::memory::format_tag in1_fmt; dnnl::memory::format_tag out_fmt; dnnl::memory::format_tag bias_fmt; - get_gemm_primitive_md(*impl_params, in0_dt, in1_dt, out_dt, in0_dims, in1_dims, out_dims, in0_fmt, in1_fmt, out_fmt, - gemm_with_bias, bias_dt, bias_dims, bias_fmt); + get_gemm_primitive_md(*impl_params, in0_dt, in1_dt, out_dt, in0_dims, in1_dims, in0_strides, in1_strides, + out_dims, in0_fmt, in1_fmt, out_fmt, gemm_with_bias, bias_dt, bias_dims, bias_fmt); ob << make_data(&in0_dt, sizeof(dnnl::memory::data_type)); ob << make_data(&in1_dt, sizeof(dnnl::memory::data_type)); @@ -215,6 +252,9 @@ struct gemm_onednn : typed_primitive_onednn_impl { ob << in1_dims; ob << out_dims; + ob << in0_strides; + ob << in1_strides; + ob << make_data(&in0_fmt, sizeof(dnnl::memory::format_tag)); ob << make_data(&in1_fmt, sizeof(dnnl::memory::format_tag)); ob << make_data(&out_fmt, sizeof(dnnl::memory::format_tag)); @@ -248,6 +288,9 @@ struct gemm_onednn : typed_primitive_onednn_impl { dnnl::memory::dims out_dims; dnnl::memory::dims bias_dims; + dnnl::memory::dims in0_strides; + dnnl::memory::dims in1_strides; + dnnl::memory::format_tag in0_fmt = dnnl::memory::format_tag::undef; dnnl::memory::format_tag in1_fmt = dnnl::memory::format_tag::undef; dnnl::memory::format_tag out_fmt = dnnl::memory::format_tag::undef; @@ -261,6 +304,9 @@ struct gemm_onednn : typed_primitive_onednn_impl { ib >> in1_dims; ib >> out_dims; + ib >> in0_strides; + ib >> in1_strides; + ib >> make_data(&in0_fmt, sizeof(dnnl::memory::format_tag)); ib >> make_data(&in1_fmt, sizeof(dnnl::memory::format_tag)); ib >> make_data(&out_fmt, sizeof(dnnl::memory::format_tag)); @@ -271,8 +317,8 @@ struct gemm_onednn : typed_primitive_onednn_impl { ib >> make_data(&bias_fmt, sizeof(dnnl::memory::format_tag)); } - dnnl::memory::desc in0_md(in0_dims, in0_dt, in0_fmt); - dnnl::memory::desc in1_md(in1_dims, in1_dt, in1_fmt); + dnnl::memory::desc in0_md = get_input_memory_desc(in0_dims, in0_dt, in0_fmt, in0_strides); + dnnl::memory::desc in1_md = get_input_memory_desc(in1_dims, in1_dt, in1_fmt, in1_strides); dnnl::memory::desc out_md(out_dims, out_dt, out_fmt); if (gemm_with_bias) { diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp index f77b4469b1f619..6214a8db4d8255 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp @@ -94,6 +94,12 @@ dnnl::memory::dims flatten_tensor(cldnn::tensor t) { return {static_cast(t.count())}; } +dnnl::memory::dims get_strides(dnnl::memory::dims dims) { + dnnl::memory::dims strides(dims.size(), dnnl::memory::dim(1)); + std::partial_sum(dims.rbegin(), dims.rend() - 1, strides.rbegin() + 1, std::multiplies()); + return strides; +} + dnnl::memory::data_type convert_data_type(cldnn::data_types dt) { switch (dt) { case cldnn::data_types::f32: return dnnl::memory::data_type::f32; diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp index a789107e2cf2bb..e8127b698f57d5 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp @@ -28,6 +28,7 @@ dnnl::memory::dims convert_tensor(cldnn::tensor t, size_t dims = 2, bool is_grou dnnl::memory::dims convert_gemm_tensor(cldnn::tensor t, 
size_t dims, bool batched_dims_can_be_removed); dnnl::memory::dims convert_spatials(cldnn::tensor t, size_t dims = 2); dnnl::memory::dims flatten_tensor(cldnn::tensor t); +dnnl::memory::dims get_strides(dnnl::memory::dims dims); dnnl::memory::data_type convert_data_type(cldnn::data_types dt); dnnl::memory::format_tag convert_data_format(cldnn::format fmt); cldnn::format convert_data_format(dnnl::memory::format_tag fmt); diff --git a/src/plugins/intel_gpu/src/graph/include/scaled_dot_product_attention_inst.h b/src/plugins/intel_gpu/src/graph/include/scaled_dot_product_attention_inst.h new file mode 100644 index 00000000000000..cecb2a0f609550 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/include/scaled_dot_product_attention_inst.h @@ -0,0 +1,40 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "intel_gpu/primitives/scaled_dot_product_attention.hpp" +#include "primitive_inst.h" + +#include + +namespace cldnn { + +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + +public: + using parent::parent; + + program_node& input(size_t index = 0) const { return get_dependency(index); } + std::vector get_shape_infer_dependencies() const override { return {}; } +}; +using scaled_dot_product_attention_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + using parent::parent; + +public: + template + static std::vector calc_output_layouts(scaled_dot_product_attention_node const& /*node*/, const kernel_impl_params& impl_param); + static layout calc_output_layout(scaled_dot_product_attention_node const& node, kernel_impl_params const& impl_param); + static std::string to_string(scaled_dot_product_attention_node const& node); + + typed_primitive_inst(network& network, scaled_dot_product_attention_node const& desc); +}; + +using scaled_dot_product_attention_inst = typed_primitive_inst; +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/roi_align.cpp b/src/plugins/intel_gpu/src/graph/roi_align.cpp index 106f92623142a6..22e5dd4bf98f6c 100644 --- a/src/plugins/intel_gpu/src/graph/roi_align.cpp +++ b/src/plugins/intel_gpu/src/graph/roi_align.cpp @@ -50,7 +50,7 @@ std::vector roi_align_inst::calc_output_layouts(roi_align_node const& no output_shapes = shape_infer(&op, input_shapes); if (primitive->roi_mode == roi_align::ROIMode::rotated) { - PERFORM_SHAPE_INFERENCE(ov::op::v14::ROIAlignRotated); + PERFORM_SHAPE_INFERENCE(ov::op::v15::ROIAlignRotated); } else { PERFORM_SHAPE_INFERENCE(ov::op::v3::ROIAlign); } diff --git a/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp new file mode 100644 index 00000000000000..42e5aeb9f1302e --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp @@ -0,0 +1,87 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "scaled_dot_product_attention_inst.h" + +#include "primitive_type_base.h" +#include "intel_gpu/runtime/error_handler.hpp" +#include "json_object.h" +#include +#include + +#include "scaled_dot_product_attention_shape_inference.hpp" +#include "intel_gpu/op/sdpa.hpp" + +namespace cldnn { +GPU_DEFINE_PRIMITIVE_TYPE_ID(scaled_dot_product_attention) + +layout scaled_dot_product_attention_inst::calc_output_layout(scaled_dot_product_attention_node const& /* node */, + 
kernel_impl_params const& impl_param) { + auto desc = impl_param.typed_desc(); + + return impl_param.get_input_layout(0); +} + +template +std::vector scaled_dot_product_attention_inst::calc_output_layouts(scaled_dot_product_attention_node const& /*node*/, + const kernel_impl_params& impl_param) { + auto prim = impl_param.typed_desc(); + auto input0_layout = impl_param.get_input_layout(0); + + auto default_out_dt = data_type_traits::is_floating_point(input0_layout.data_type) ? input0_layout.data_type : data_types::f32; + auto output_type = prim->output_data_types[0].value_or(default_out_dt); + + if (impl_param.has_fused_primitives()) { + output_type = impl_param.get_output_element_type(); + } + + ov::intel_gpu::op::SDPA op; + + std::vector input_shapes; + for (size_t i = 0; i < impl_param.input_layouts.size(); i++) { + input_shapes.push_back(impl_param.get_input_layout(0).get()); + } + + std::vector output_shapes = ov::intel_gpu::op::shape_infer(&op, + input_shapes, + prim->input_q_transpose_order, + prim->input_k_transpose_order, + prim->input_v_transpose_order, + prim->output_transpose_order); + + cldnn::format output_format = input0_layout.format; + + return { layout{output_shapes[0], output_type, output_format, prim->output_paddings[0]} }; +} + +template std::vector scaled_dot_product_attention_inst::calc_output_layouts(scaled_dot_product_attention_node const& node, + const kernel_impl_params& impl_param); + +std::string scaled_dot_product_attention_inst::to_string(scaled_dot_product_attention_node const& node) { + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + auto& input = node.input(); + + std::stringstream primitive_description; + + json_composite scaled_dot_product_attention_info; + scaled_dot_product_attention_info.add("input id", input.id()); + scaled_dot_product_attention_info.add("is_causal", desc->is_causal); + scaled_dot_product_attention_info.add("has_attn_mask_input", desc->has_attn_mask_input); + scaled_dot_product_attention_info.add("has_scale_input", desc->has_scale_input); + scaled_dot_product_attention_info.add("input_q_transpose_order", desc->input_q_transpose_order); + scaled_dot_product_attention_info.add("input_k_transpose_order", desc->input_k_transpose_order); + scaled_dot_product_attention_info.add("input_v_transpose_order", desc->input_v_transpose_order); + scaled_dot_product_attention_info.add("output_transpose_order", desc->output_transpose_order); + + node_info->add("scaled_dot_product_attention_info", scaled_dot_product_attention_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +scaled_dot_product_attention_inst::typed_primitive_inst(network& network, scaled_dot_product_attention_node const& node) + : parent(network, node) {} + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl new file mode 100644 index 00000000000000..14cef4010c6bea --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -0,0 +1,1169 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" +#include "include/batch_headers/common.cl" +#include "include/batch_headers/sub_group_block_read.cl" +#include "include/batch_headers/sub_group_block_write.cl" +#include "include/batch_headers/sub_group_shuffle.cl" + +// query_input [batch, heads_num, q_len, head_size] +// key_input [batch, kv_heads_num, 
kv_len, head_size] +// value_input [batch, kv_heads_num, kv_len, head_size] +// attn_mask [1, 1, q_len, kv_len] +// output [batch, heads_num, q_len, head_size] +// exp_sums [batch, heads_num, q_len, partition_idx] +// max_logits [batch, heads_num, q_len, partition_idx] +// tmp_out [batch, heads_num, q_len, partition_idx, head_size] + + +inline uint FUNC(get_input0_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#if INPUT0_SIMPLE + return GET_DATA_INDEX_6D_SAFE(INPUT0, b, f, w, z, y, x); +#else +#if INPUT0_DIMS == 4 + return INPUT0_GET_INDEX_SAFE(b, f, y, x); +#elif INPUT0_DIMS == 5 + return INPUT0_GET_INDEX_SAFE(b, f, z, y, x); +#elif INPUT0_DIMS == 6 + return INPUT0_GET_INDEX_SAFE(b, f, w, z, y, x); +#else +# error sdpa_ref.cl : Unsupported input 0 format +#endif +#endif +} + +inline uint FUNC(get_input0_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef INPUT0_DIMS_ORDER + return FUNC_CALL(get_input0_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR INPUT0_DIMS_ORDER); +#else + return FUNC_CALL(get_input0_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, x); +#endif +} + +inline uint FUNC(get_input1_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef DO_BROADCAST_KEY_VALUE + DO_BROADCAST_KEY_VALUE; +#endif +#if INPUT1_SIMPLE + return GET_DATA_INDEX_6D_SAFE(INPUT1, b, f, w, z, y, x); +#else +#if INPUT1_DIMS == 4 + return INPUT1_GET_INDEX_SAFE(b, f, y, x); +#elif INPUT1_DIMS == 5 + return INPUT1_GET_INDEX_SAFE(b, f, z, y, x); +#elif INPUT1_DIMS == 6 + return INPUT1_GET_INDEX_SAFE(b, f, w, z, y, x); +#else +# error sdpa_ref.cl : Unsupported input 1 format +#endif +#endif +} + +inline uint FUNC(get_input1_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef INPUT1_DIMS_ORDER + return FUNC_CALL(get_input1_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR INPUT1_DIMS_ORDER); +#else + return FUNC_CALL(get_input1_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, x); +#endif +} + +inline uint FUNC(get_input2_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef DO_BROADCAST_KEY_VALUE + DO_BROADCAST_KEY_VALUE; +#endif +#if INPUT2_SIMPLE + return GET_DATA_INDEX_6D_SAFE(INPUT2, b, f, w, z, y, x); +#else +#if INPUT2_DIMS == 4 + return INPUT2_GET_INDEX_SAFE(b, f, y, x); +#elif INPUT2_DIMS == 5 + return INPUT2_GET_INDEX_SAFE(b, f, z, y, x); +#elif INPUT2_DIMS == 6 + return INPUT2_GET_INDEX_SAFE(b, f, w, z, y, x); +#else +# error sdpa_ref.cl : Unsupported input 1 format +#endif +#endif +} + +inline uint FUNC(get_input2_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef INPUT2_DIMS_ORDER + return FUNC_CALL(get_input2_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR INPUT2_DIMS_ORDER); +#else + return FUNC_CALL(get_input2_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, x); +#endif +} + +#define VALUE_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT2_TYPE, 1, ptr, offset) +#define SUBGROUPS_PER_WG (HEAD_SIZE / SUBGROUP_SIZE) + +#ifdef SDPA_STAGE_0 + +#if TARGET_SEQ_LEN_BLOCK_SIZE == 1 +/* This version is used for 2nd token */ + +REQD_SUB_GROUP_SIZE(SUBGROUP_SIZE) +KERNEL(sdpa_opt)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* query_input, + const __global INPUT1_TYPE* key_input, + const __global INPUT2_TYPE* value_input, +#if HAS_ATTN_MASK_INPUT + const __global INPUT3_TYPE* attn_mask, +#endif +#if HAS_SCALE_INPUT + const __global INPUT4_TYPE* scale, +#endif + __global OUTPUT_TYPE* output, + __global 
SOFTMAX_ACCUMULATOR_TYPE* exp_sums, + __global SOFTMAX_ACCUMULATOR_TYPE* max_logits, + __global OUTPUT_TYPE* tmp_out +) +{ + const uint batch_idx = get_global_id(0); + const uint b0_idx = batch_idx / NUM_HEADS; /* BATCH dim */ + const uint b1_idx = batch_idx % NUM_HEADS; /* HEADS_NUM dim */ + +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint target_seq_idx = (uint)get_global_id(1) * TARGET_SEQ_LEN_BLOCK_SIZE; +#else + const uint target_seq_idx = get_global_id(1); +#endif + const uint lid = get_local_id(2); + const uint head_size_idx = lid; + + const uint sgid = get_sub_group_id(); + const uint sglid = get_sub_group_local_id(); + + const uint partition_idx = get_group_id(2); + const uint num_of_partitions = get_num_groups(2); + const uint wi_num_per_partition = get_local_size(2); + + const uint start_partition_idx = partition_idx * SEQ_LEN_PARTITION_SIZE; + const uint partition_seq_len = + ((partition_idx + 1) < num_of_partitions) ? (SEQ_LEN_PARTITION_SIZE) + : (SOURCE_SEQ_LEN - partition_idx * SEQ_LEN_PARTITION_SIZE); + + // SLM for query inputs + __local INPUT0_TYPE query_local[HEAD_SIZE * TARGET_SEQ_LEN_BLOCK_SIZE]; + // SLM for intermediate QK results + __local OUTPUT_TYPE qk_local[SEQ_LEN_PARTITION_SIZE * TARGET_SEQ_LEN_BLOCK_SIZE]; + // SLM buffers for SoftMax calculation and qk_max/qk_sums results aggregation across all WG + __local SOFTMAX_ACCUMULATOR_TYPE qk_max_vals[SUBGROUPS_PER_WG * TARGET_SEQ_LEN_BLOCK_SIZE]; + __local SOFTMAX_ACCUMULATOR_TYPE qk_sum_vals[SUBGROUPS_PER_WG * TARGET_SEQ_LEN_BLOCK_SIZE]; + + { + // Gemm1 and SoftMax calculation + + SOFTMAX_ACCUMULATOR_TYPE qk_max[TARGET_SEQ_LEN_BLOCK_SIZE] = {SOFTMAX_ACCUMULATOR_VAL_MIN}; + for (uint i = 0; i < TARGET_SEQ_LEN_BLOCK_SIZE; i++) { + qk_max[i] = SOFTMAX_ACCUMULATOR_VAL_MIN; + } + + { + // Gemm1 calculation +#if HAS_SCALE_INPUT + const OUTPUT_TYPE scale_val = *scale; +#else + const OUTPUT_TYPE scale_val = OUTPUT_VAL_ONE / sqrt(TO_OUTPUT_TYPE(HEAD_SIZE)); +#endif + { + // Query input loading to SLM + #define QUERY_STEP_LOCAL SUBGROUP_SIZE * SUBGROUPS_PER_WG + uint query_local_offset = sgid * SUBGROUP_SIZE + sglid; + +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); +#else + const uint seq_idx_end = 1; +#endif +#ifdef INPUT0_DIMS_ORDER + uint query_offset = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, target_seq_idx, (sgid * SUBGROUP_SIZE)); + uint query_offset_next_seq = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, target_seq_idx + 1, (sgid * SUBGROUP_SIZE)); + const uint query_pitch = query_offset_next_seq - query_offset; +#else + uint query_offset = INPUT0_GET_INDEX(b0_idx, b1_idx, target_seq_idx, (sgid * SUBGROUP_SIZE)); + const uint query_pitch = QUERY_STEP_LOCAL; +#endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + #define QUERY_BLOCK_SIZE 1 + + INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, QUERY_BLOCK_SIZE, query_input, query_offset); + + query_local[query_local_offset] = val; + query_local_offset += QUERY_STEP_LOCAL; + query_offset += query_pitch; + } + #undef QUERY_BLOCK_SIZE + #undef QUERY_STEP + + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Main Gemm1 calculation loop + // Each SG performs element-wise multiplications of Q[HEAD_SIZE]xK[HEAD_SIZE] values + // HEAD_SIZE / SUBGROUPS_PER_WG times in the loop and saves the result to the qk_local SLM buffer + for (uint seq_len = sgid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE / SUBGROUP_SIZE)) { +#ifdef 
INPUT1_DIMS_ORDER + uint key_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len, 0); +#else + uint key_offset = INPUT1_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len, 0); +#endif + + INPUT0_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {INPUT0_VAL_ZERO}; + + uint head_idx_index = 0; + #define KEY_BLOCK_SIZE 8 + for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { + #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); + #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) + + KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + + uint query_offset = head_idx_index + sglid; + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + QUERY_BLOCK query_vals_reg; + unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { + query_vals_reg[i] = query_local[query_offset + i * SUBGROUP_SIZE]; + } + + unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { + acc[seq_idx] = mad(query_vals_reg[i], key_vals[i], acc[seq_idx]); + } + + query_offset += HEAD_SIZE; + } + } + + #define KEY_BLOCK_SIZE 4 + for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { + #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); + #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) + + KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + + uint query_offset = head_idx_index + sglid; + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + QUERY_BLOCK query_vals_reg; + unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { + query_vals_reg[i] = query_local[query_offset + i * SUBGROUP_SIZE]; + } + + unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { + acc[seq_idx] = mad(query_vals_reg[i], key_vals[i], acc[seq_idx]); + } + + query_offset += HEAD_SIZE; + } + } + + #define KEY_BLOCK_SIZE 2 + for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { + #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); + #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) + + KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + + uint query_offset = head_idx_index + sglid; + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + QUERY_BLOCK query_vals_reg; + unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { + query_vals_reg[i] = query_local[query_offset + i * SUBGROUP_SIZE]; + } + + unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { + acc[seq_idx] = mad(query_vals_reg[i], key_vals[i], acc[seq_idx]); + } + + query_offset += HEAD_SIZE; + } + } + + #define KEY_BLOCK_SIZE 1 + for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { + #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); + #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) + + KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + + uint 
query_offset = head_idx_index + sglid; + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + QUERY_BLOCK query_vals_reg; + unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { + query_vals_reg = query_local[query_offset + i * SUBGROUP_SIZE]; + } + + acc[seq_idx] = mad(query_vals_reg, key_vals, acc[seq_idx]); + query_offset += HEAD_SIZE; + } + } + + // Sum up all accumulators accross single SG and save result to SLM + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + acc[seq_idx] = sub_group_reduce_add(acc[seq_idx]); + qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len] = acc[seq_idx]; + } + } + + { + // Wait until all SG finishes their calculations and apply scale and attention mask to the results + barrier(CLK_LOCAL_MEM_FENCE); + + INPUT0_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); +#else + const uint seq_idx_end = 1; +#endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + // Iterate over all values QK values in SLM and apply scale and attention mask + for (uint seq_len = sgid * SUBGROUP_SIZE + sglid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE)) { + // Read value from SLM and apply scale + qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len]; + qk_val[seq_idx] *= scale_val; + + // Apply attention mask +#if IS_CAUSAL + if (start_partition_idx + seq_len > target_seq_idx + seq_idx) + qk_val[seq_idx] += INPUT0_VAL_MIN; +#elif !IS_CAUSAL && HAS_ATTN_MASK_INPUT + const uint attn_mask_offset = INPUT3_GET_INDEX_SAFE(b0_idx, b1_idx, target_seq_idx + seq_idx, start_partition_idx + seq_len); + qk_val[seq_idx] += attn_mask[attn_mask_offset]; +#endif + + // Update qk_max value + qk_max[seq_idx] = SOFTMAX_ACCUMULATOR_MAX_FUNC(qk_max[seq_idx], TO_SOFTMAX_ACCUMULATOR_TYPE(qk_val[seq_idx])); + + // Save modified qk value back to SLM + qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len] = qk_val[seq_idx]; + } + } + } + } // Gemm1 calculation end + + { + // SoftMax calculation +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); +#else + const uint seq_idx_end = 1; +#endif + // Find the maximum value of qk in the subgroup + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + qk_max[seq_idx] = sub_group_reduce_max(qk_max[seq_idx]); + } + + // Find the maximum value of qk across all subgroups in the workgroup + if (sglid == 0) { + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + qk_max_vals[seq_idx * SUBGROUPS_PER_WG + sgid] = qk_max[seq_idx]; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + qk_max[seq_idx] = SOFTMAX_ACCUMULATOR_VAL_MIN; + + if (sglid < SUBGROUPS_PER_WG) + qk_max[seq_idx] = qk_max_vals[seq_idx * SUBGROUPS_PER_WG + sglid]; + + // Final maximum value of qk after reduction across all subgroups + qk_max[seq_idx] = sub_group_reduce_max(qk_max[seq_idx]); + } + + SOFTMAX_ACCUMULATOR_TYPE exp_sum[TARGET_SEQ_LEN_BLOCK_SIZE] = {SOFTMAX_ACCUMULATOR_VAL_ZERO}; + const uint qk_num_per_wi = CEIL_DIV(partition_seq_len, SUBGROUPS_PER_WG * SUBGROUP_SIZE); + for (uint qk_idx = 0; qk_idx < qk_num_per_wi; qk_idx++) { + const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + head_size_idx; + if (local_data_idx < partition_seq_len) { + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + 
SOFTMAX_ACCUMULATOR_TYPE qk_new = native_exp(TO_SOFTMAX_ACCUMULATOR_TYPE(qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + local_data_idx]) - qk_max[seq_idx]); + qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + local_data_idx] = TO_OUTPUT_TYPE(qk_new); + + exp_sum[seq_idx] += qk_new; + } + } + } + + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + exp_sum[seq_idx] = sub_group_reduce_add(exp_sum[seq_idx]); + + if (sglid == 0) + qk_sum_vals[seq_idx * SUBGROUPS_PER_WG + sgid] = exp_sum[seq_idx]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + exp_sum[seq_idx] = SOFTMAX_ACCUMULATOR_VAL_ZERO; + + if (sglid < SUBGROUPS_PER_WG) + exp_sum[seq_idx] = qk_sum_vals[seq_idx * SUBGROUPS_PER_WG + sglid]; + + // Find the final sum of all exp_sum[seq_idx] values in workgroup + exp_sum[seq_idx] = sub_group_reduce_add(exp_sum[seq_idx]); + } + + // const SOFTMAX_ACCUMULATOR_TYPE inv_exp_sum = SOFTMAX_ACCUMULATOR_VAL_ONE / exp_sum[seq_idx]; + for (uint qk_idx = 0; qk_idx < qk_num_per_wi; qk_idx++) { + const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + sgid * SUBGROUP_SIZE + sglid; + if (local_data_idx < partition_seq_len) { + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + SOFTMAX_ACCUMULATOR_TYPE qk_new = TO_SOFTMAX_ACCUMULATOR_TYPE(qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + local_data_idx]) / exp_sum[seq_idx]; + qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + local_data_idx] = TO_OUTPUT_TYPE(qk_new); + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + { + // If the number of partitions is greater than 1, save exm_sums and max_logits to the temporary buffers + // Use single WI in the WG, since all the WIs have the same value + if (num_of_partitions > 1 && head_size_idx == 0) { + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + const uint exp_sums_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions) + + (seq_idx + target_seq_idx) * (num_of_partitions) + + partition_idx; + exp_sums[exp_sums_offset] = exp_sum[seq_idx]; + + const uint max_logits_offset = exp_sums_offset; + max_logits[max_logits_offset] = qk_max[seq_idx]; + } + } + } + } // SoftMax calculation end + } // Gemm1 + SoftMax calculations end + + { + // Gemm2 calculation + OUTPUT_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {OUTPUT_VAL_ZERO}; + +#ifdef INPUT2_DIMS_ORDER + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 0, 0); + uint value_offset_next_seq = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 1, 0); + const uint value_pitch = value_offset_next_seq - value_offset; +#else + const uint value_pitch = HEAD_SIZE; +#endif + + for (uint seq_len = 0; seq_len < partition_seq_len / SUBGROUP_SIZE; seq_len++) { +#ifdef INPUT2_DIMS_ORDER + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); +#else + uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); +#endif + + OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len * SUBGROUP_SIZE + sglid]; + } + + unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { + INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + unroll_for (uint 
seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + acc[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc[seq_idx]); + } + + value_offset += value_pitch; + } + } + + const uint seq_len_leftovers_start = (partition_seq_len / SUBGROUP_SIZE) * SUBGROUP_SIZE; + for (uint seq_len = seq_len_leftovers_start; seq_len < partition_seq_len; seq_len++) { +#ifdef INPUT2_DIMS_ORDER + const uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len, head_size_idx); +#else + const uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len, head_size_idx); +#endif + + OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len]; + } + + INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + acc[seq_idx] = mad(qk_val[seq_idx], value_val, acc[seq_idx]); + } + } + + // If the number of partitions is greater than 1, save results to the temporary buffer; + // otherwise, save results directly to the main output. + if (num_of_partitions > 1) { +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); +#else + const uint seq_idx_end = 1; +#endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + // Data layout of tmp_output buf: [batch, heads_num, q_len, partition_idx, head_size] + const uint tmp_out_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + (target_seq_idx + seq_idx) * (num_of_partitions * HEAD_SIZE) + + partition_idx * (HEAD_SIZE) + + head_size_idx; + tmp_out[tmp_out_offset] = acc[seq_idx]; + } + } else { +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); +#else + const uint seq_idx_end = 1; +#endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + const uint output_offset = OUTPUT_GET_INDEX(b0_idx, b1_idx, target_seq_idx + seq_idx, head_size_idx); + + output[output_offset] = acc[seq_idx]; + } + } + } // Gemm2 calculation end +} + +#else +/* This version is used for 1st token */ + +REQD_SUB_GROUP_SIZE(SUBGROUP_SIZE) +KERNEL(sdpa_opt)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* query_input, + const __global INPUT1_TYPE* key_input, + const __global INPUT2_TYPE* value_input, +#if HAS_ATTN_MASK_INPUT + const __global INPUT3_TYPE* attn_mask, +#endif +#if HAS_SCALE_INPUT + const __global INPUT4_TYPE* scale, +#endif + __global OUTPUT_TYPE* output, + __global SOFTMAX_ACCUMULATOR_TYPE* exp_sums, + __global SOFTMAX_ACCUMULATOR_TYPE* max_logits, + __global OUTPUT_TYPE* tmp_out +) +{ + const uint batch_idx = get_global_id(0); + const uint b0_idx = batch_idx / NUM_HEADS; /* BATCH dim */ + const uint b1_idx = batch_idx % NUM_HEADS; /* HEADS_NUM dim */ + +#if TARGET_SEQ_LEN_BLOCK_SIZE != 1 && TARGET_SEQ_LEN_BLOCK_SIZE != 16 + #error TARGET_SEQ_LEN_BLOCK_SIZE unexpected size +#endif + +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint target_seq_idx = (uint)get_global_id(1) * TARGET_SEQ_LEN_BLOCK_SIZE; +#else + const uint target_seq_idx = get_global_id(1); +#endif + const uint lid = get_local_id(2); + const uint head_size_idx = lid; + + const uint sgid = get_sub_group_id(); + 
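
Both kernel variants implement the same per-row score pipeline: scale the Q·K dot products, add the causal or explicit attention mask, subtract the row maximum before exponentiation for numerical stability, and normalise by the exponent sum before the Gemm2 stage multiplies by V. A scalar reference of that pipeline (illustrative only; the per-partition exp_sums/max_logits bookkeeping used for multi-partition launches is omitted):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// scores holds the raw Q·K dot products for one query row; mask may be null.
void softmax_row(std::vector<float>& scores, float scale, const float* mask) {
    float row_max = -INFINITY;
    for (std::size_t i = 0; i < scores.size(); ++i) {
        scores[i] = scores[i] * scale + (mask ? mask[i] : 0.0f);
        row_max = std::max(row_max, scores[i]);
    }
    float exp_sum = 0.0f;
    for (float& s : scores) {
        s = std::exp(s - row_max);   // max-subtraction keeps exp() in range
        exp_sum += s;
    }
    for (float& s : scores)
        s /= exp_sum;                // probabilities later multiplied by V in Gemm2
}
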
const uint sglid = get_sub_group_local_id(); + + const uint partition_idx = get_group_id(2); + const uint num_of_partitions = get_num_groups(2); + const uint wi_num_per_partition = get_local_size(2); + + const uint start_partition_idx = partition_idx * SEQ_LEN_PARTITION_SIZE; + const uint partition_seq_len = + ((partition_idx + 1) < num_of_partitions) ? (SEQ_LEN_PARTITION_SIZE) + : (SOURCE_SEQ_LEN - partition_idx * SEQ_LEN_PARTITION_SIZE); + + const uint target_seq_len_bs = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); + + // SLM for query inputs + __local INPUT0_TYPE query_local[HEAD_SIZE * TARGET_SEQ_LEN_BLOCK_SIZE]; + // SLM for intermediate QK results + __local OUTPUT_TYPE qk_local[SEQ_LEN_PARTITION_SIZE * TARGET_SEQ_LEN_BLOCK_SIZE]; + // SLM buffers for SoftMax calculation and qk_max/qk_sums results aggregation across all WG + __local SOFTMAX_ACCUMULATOR_TYPE qk_max_vals[SUBGROUPS_PER_WG * TARGET_SEQ_LEN_BLOCK_SIZE]; + __local SOFTMAX_ACCUMULATOR_TYPE qk_sum_vals[SUBGROUPS_PER_WG * TARGET_SEQ_LEN_BLOCK_SIZE]; + + { + // Gemm1 and SoftMax calculation + + SOFTMAX_ACCUMULATOR_TYPE qk_max = SOFTMAX_ACCUMULATOR_VAL_MIN; + + { + // Gemm1 calculation +#if HAS_SCALE_INPUT + const OUTPUT_TYPE scale_val = *scale; +#else + const OUTPUT_TYPE scale_val = OUTPUT_VAL_ONE / sqrt(TO_OUTPUT_TYPE(HEAD_SIZE)); +#endif + { + // Load Query input to SLM and transpose it +#ifdef INPUT0_DIMS_ORDER + uint query_offset = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, target_seq_idx, (sgid * SUBGROUP_SIZE)); + uint query_offset_next_seq = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, target_seq_idx + 1, (sgid * SUBGROUP_SIZE)); + const uint query_pitch = query_offset_next_seq - query_offset; +#else + uint query_offset = INPUT0_GET_INDEX(b0_idx, b1_idx, target_seq_idx, (sgid * SUBGROUP_SIZE)); + const uint query_pitch = SUBGROUP_SIZE * SUBGROUPS_PER_WG; +#endif + uint query_local_offset = (sgid * SUBGROUP_SIZE + sglid) * TARGET_SEQ_LEN_BLOCK_SIZE; + if (target_seq_len_bs != TARGET_SEQ_LEN_BLOCK_SIZE) { + for (uint seq_idx = 0; seq_idx < target_seq_len_bs; seq_idx++) { + INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); + + query_local[query_local_offset] = val; + query_offset += query_pitch; + query_local_offset++; + } + } else { + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); + + query_local[query_local_offset] = val; + query_offset += query_pitch; + query_local_offset++; + } + } + } + + { + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Main Gemm1 calculation loop + uint seq_len = sgid * TARGET_SEQ_LEN_BLOCK_SIZE; + for (; seq_len < partition_seq_len; seq_len += SUBGROUPS_PER_WG * SUBGROUP_SIZE) { +#ifdef INPUT1_DIMS_ORDER + uint key_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len, 0); + uint key_offset_next_seq = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len + 1, 0); + const uint key_pitch = key_offset_next_seq - key_offset; +#else + uint key_offset = INPUT1_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len, 0); + const uint key_pitch = HEAD_SIZE; +#endif + + INPUT0_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {INPUT0_VAL_ZERO}; + + for (uint head_idx_index = 0; head_idx_index < HEAD_SIZE; head_idx_index += SUBGROUP_SIZE) { + #define KEY_BLOCK_READ(ptr, offset) 
BLOCK_READN(INPUT1_TYPE, 1, ptr, offset); + #define QUERY_VEC MAKE_VECTOR_TYPE(INPUT1_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) + + QUERY_VEC queries_vec; + uint query_local_offset = (head_idx_index * TARGET_SEQ_LEN_BLOCK_SIZE) + sglid; + unroll_for (uint q_row_idx = 0; q_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; q_row_idx++) { + queries_vec[q_row_idx] = query_local[query_local_offset]; + query_local_offset += TARGET_SEQ_LEN_BLOCK_SIZE; + } + + unroll_for (uint key_row_idx = 0; key_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; key_row_idx++) { + INPUT1_TYPE key_vals = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); + + unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { + acc[key_row_idx] = mad(sub_group_broadcast(key_vals, i), queries_vec[i], acc[key_row_idx]); + } + } + } + + { +#if !IS_CAUSAL && HAS_ATTN_MASK_INPUT + const uint attn_mask_offset = INPUT3_GET_INDEX_SAFE(b0_idx, b1_idx, target_seq_idx + sglid, start_partition_idx + seq_len); + MAKE_VECTOR_TYPE(INPUT3_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) attn_mask_vec = INPUT3_VAL_MIN; + for (uint i = 0; i < min(partition_seq_len - seq_len, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); i++) { + attn_mask_vec[i] = attn_mask[attn_mask_offset + i]; + } +#endif + unroll_for (uint i = 0; i < TARGET_SEQ_LEN_BLOCK_SIZE; i++) { + acc[i] *= scale_val; +#if IS_CAUSAL + if (start_partition_idx + seq_len + i > target_seq_idx + sglid) + acc[i] += INPUT0_VAL_MIN; +#elif !IS_CAUSAL && HAS_ATTN_MASK_INPUT + acc[i] += attn_mask_vec[i]; +#endif +#if INPUT0_TYPE_SIZE == 2 + /* Adding this clamp improves performance for some reason */ + acc[i] = SOFTMAX_ACCUMULATOR_MIN_FUNC(SOFTMAX_ACCUMULATOR_MAX_FUNC(acc[i], INPUT0_VAL_MIN), INPUT0_VAL_MAX); +#endif + if (seq_len + i >= partition_seq_len) { + acc[i] = INPUT0_VAL_MIN; + } + + qk_max = SOFTMAX_ACCUMULATOR_MAX_FUNC(qk_max, TO_SOFTMAX_ACCUMULATOR_TYPE(acc[i])); + qk_local[sglid * SEQ_LEN_PARTITION_SIZE + seq_len + i] = acc[i]; + } + } + } + } // Gemm1 calculation end + + { + // Save QK max to SLM + qk_max_vals[sglid * SUBGROUPS_PER_WG + sgid] = qk_max; + } + + { + // SoftMax calculation +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = target_seq_len_bs; +#else + const uint seq_idx_end = 1; +#endif + #define QK_MAX_NUMS_PER_SG CEIL_DIV(TARGET_SEQ_LEN_BLOCK_SIZE, SUBGROUPS_PER_WG) + #if (TARGET_SEQ_LEN_BLOCK_SIZE % SUBGROUPS_PER_WG != 0) + /* /* If TARGET_SEQ_LEN_BLOCK_SIZE is not divisible by SUBGROUPS_PER_WG, then some subgroups will have to handle more QK rows than others */ + #define QK_ITERS_END \ + (TARGET_SEQ_LEN_BLOCK_SIZE / SUBGROUPS_PER_WG + (sgid < TARGET_SEQ_LEN_BLOCK_SIZE % SUBGROUPS_PER_WG ? 
1 : 0)) + #else + #define QK_ITERS_END QK_MAX_NUMS_PER_SG + #endif + + OUTPUT_TYPE qk_max[QK_MAX_NUMS_PER_SG]; + for (uint i = 0; i < QK_MAX_NUMS_PER_SG; i++) + qk_max[i] = SOFTMAX_ACCUMULATOR_VAL_MIN; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (sglid < SUBGROUPS_PER_WG) + for (uint i = 0; i < QK_ITERS_END; i++) + qk_max[i] = qk_max_vals[(i * SUBGROUPS_PER_WG * SUBGROUPS_PER_WG) + sgid * SUBGROUPS_PER_WG + sglid]; + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + for (uint i = 0; i < QK_ITERS_END; i++) { + qk_max[i] = sub_group_reduce_max(qk_max[i]); + } + + SOFTMAX_ACCUMULATOR_TYPE exp_sum[QK_MAX_NUMS_PER_SG]; + for (uint i = 0; i < QK_MAX_NUMS_PER_SG; i++) + exp_sum[i] = SOFTMAX_ACCUMULATOR_VAL_ZERO; + + for (uint i = 0; i < QK_ITERS_END; i++) { + // TODO: Try full loop, with ternary operator + for (uint qk_idx = sglid; qk_idx < partition_seq_len; qk_idx += SUBGROUP_SIZE) { + const uint qk_offset = i * SUBGROUPS_PER_WG * SEQ_LEN_PARTITION_SIZE + sgid * SEQ_LEN_PARTITION_SIZE + qk_idx; + SOFTMAX_ACCUMULATOR_TYPE qk_val = qk_local[qk_offset]; + SOFTMAX_ACCUMULATOR_TYPE qk_new = native_exp(TO_SOFTMAX_ACCUMULATOR_TYPE(qk_val) - qk_max[i]); + qk_local[qk_offset] = qk_new; + exp_sum[i] += qk_new; + } + } + + for (uint i = 0; i < QK_ITERS_END; i++) { + exp_sum[i] = sub_group_reduce_add(exp_sum[i]); + } + + for (uint i = 0; i < QK_ITERS_END; i++) { + for (uint qk_idx = sglid; qk_idx < partition_seq_len; qk_idx += SUBGROUP_SIZE) { + const uint qk_offset = i * SUBGROUPS_PER_WG * SEQ_LEN_PARTITION_SIZE + sgid * SEQ_LEN_PARTITION_SIZE + qk_idx; + SOFTMAX_ACCUMULATOR_TYPE qk_val = TO_SOFTMAX_ACCUMULATOR_TYPE(qk_local[qk_offset]); + SOFTMAX_ACCUMULATOR_TYPE qk_new = qk_val / exp_sum[i]; + qk_local[qk_offset] = qk_new; + } + } + + { + // If the number of partitions is greater than 1, save exm_sums and max_logits to the temporary buffers + // Use single WI in the WG, since all the WIs have the same value + if (num_of_partitions > 1 && sglid == 0) { + for (uint i = 0; i < QK_MAX_NUMS_PER_SG; i++) { + if (target_seq_idx + sgid + (i * SUBGROUPS_PER_WG) < TARGET_SEQ_LEN) { + const uint exp_sums_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions) + + (target_seq_idx + sgid + (i * SUBGROUPS_PER_WG)) * (num_of_partitions) + + partition_idx; + exp_sums[exp_sums_offset] = exp_sum[i]; + + const uint max_logits_offset = exp_sums_offset; + max_logits[max_logits_offset] = qk_max[i]; + } + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + } // SoftMax calculation end + } // Gemm1 + SoftMax calculations end + + const uint seq_len_leftovers_start = (partition_seq_len / SUBGROUP_SIZE) * SUBGROUP_SIZE; + if (seq_len_leftovers_start != partition_seq_len) { + // Gemm2 calculation + OUTPUT_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {OUTPUT_VAL_ZERO}; + +#ifdef INPUT2_DIMS_ORDER + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 0, 0); + uint value_offset_next_seq = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 1, 0); + const uint value_pitch = value_offset_next_seq - value_offset; +#else + const uint value_pitch = HEAD_SIZE; +#endif + + for (uint seq_len = 0; seq_len < partition_seq_len / SUBGROUP_SIZE; seq_len++) { +#ifdef INPUT2_DIMS_ORDER + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); +#else + uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + (seq_len * 
SUBGROUP_SIZE), head_size_idx); +#endif + + OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len * SUBGROUP_SIZE + sglid]; + } + + unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { + INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + acc[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc[seq_idx]); + } + + value_offset += value_pitch; + } + } + + + /* The handling of leftovers causes significantly worse assembly code generation for the above main calculation loop. + Therefore, there are two independent branches for the calculation of QK*V matrices: + one with leftovers handling (when seq_len_leftovers_start != partition_seq_len) and one without. */ + { + OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; + uint qk_offset = min(seq_len_leftovers_start + sglid, partition_seq_len); + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + qk_val[seq_idx] = qk_local[qk_offset]; + qk_offset += SEQ_LEN_PARTITION_SIZE; + } + + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len_leftovers_start, head_size_idx); + + for (uint seq_len_idx = 0; seq_len_idx < partition_seq_len - seq_len_leftovers_start; seq_len_idx++) { + INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + + for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + acc[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], seq_len_idx), value_val, acc[seq_idx]); + } + + value_offset += value_pitch; + } + } + + // If the number of partitions is greater than 1, save results to the temporary buffer; + // otherwise, save results directly to the main output. 
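
For reference, here is a minimal host-side sketch of the intermediate-buffer addressing used when the source sequence is split into partitions. It only restates the layouts described in the kernel comments ([batch, heads_num, q_len, partition_idx, head_size] for tmp_out, the same minus head_size for exp_sums/max_logits); the struct and function names are illustrative and not part of the patch.

```cpp
#include <cstddef>

// Hypothetical mirror of the kernel's addressing of the per-partition buffers,
// shown only to make the flattened offset arithmetic below easier to follow.
struct PartitionedLayout {
    size_t heads_num, q_len, num_partitions, head_size;

    // Offset of one element of the per-partition partial output (tmp_out).
    size_t tmp_out_offset(size_t b, size_t h, size_t q, size_t p, size_t x) const {
        return b * (heads_num * q_len * num_partitions * head_size) +
               h * (q_len * num_partitions * head_size) +
               q * (num_partitions * head_size) +
               p * head_size +
               x;
    }

    // exp_sums and max_logits share a layout that drops the head_size dimension.
    size_t stats_offset(size_t b, size_t h, size_t q, size_t p) const {
        return b * (heads_num * q_len * num_partitions) +
               h * (q_len * num_partitions) +
               q * num_partitions +
               p;
    }
};
```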
+ if (num_of_partitions > 1) { +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); +#else + const uint seq_idx_end = 1; +#endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + // Data layout of tmp_output buf: [batch, heads_num, q_len, partition_idx, head_size] + const uint tmp_out_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + (target_seq_idx + seq_idx) * (num_of_partitions * HEAD_SIZE) + + partition_idx * (HEAD_SIZE) + + head_size_idx; + + tmp_out[tmp_out_offset] = acc[seq_idx]; + } + } else { +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); +#else + const uint seq_idx_end = 1; +#endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + const uint output_offset = OUTPUT_GET_INDEX(b0_idx, b1_idx, target_seq_idx + seq_idx, head_size_idx); + + output[output_offset] = acc[seq_idx]; + } + } + } else { + // Gemm2 calculation + OUTPUT_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {OUTPUT_VAL_ZERO}; + +#ifdef INPUT2_DIMS_ORDER + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 0, 0); + uint value_offset_next_seq = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 1, 0); + const uint value_pitch = value_offset_next_seq - value_offset; +#else + const uint value_pitch = HEAD_SIZE; +#endif + + for (uint seq_len = 0; seq_len < partition_seq_len / SUBGROUP_SIZE; seq_len++) { +#ifdef INPUT2_DIMS_ORDER + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); +#else + uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); +#endif + + OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len * SUBGROUP_SIZE + sglid]; + } + + unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { + INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { + acc[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc[seq_idx]); + } + + value_offset += value_pitch; + } + } + + // If the number of partitions is greater than 1, save results to the temporary buffer; + // otherwise, save results directly to the main output. 
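
When the source sequence spans several partitions, each partition's partial output is already normalized by its local softmax sum, and the finalization kernel further below re-weights it using the saved (exp_sum, max_logit) pairs. A minimal scalar sketch of that reduction, assuming a single head, query row, and output channel (all names here are illustrative only):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Combine locally normalized partial outputs into the globally normalized result.
// partial_out[p] = sum_j softmax_local(qk_j) * v_j for partition p,
// exp_sum[p]     = sum_j exp(qk_j - max_logit[p]),
// max_logit[p]   = max_j qk_j within partition p.
float combine_partitions(const std::vector<float>& partial_out,
                         const std::vector<float>& exp_sum,
                         const std::vector<float>& max_logit) {
    const size_t p_num = partial_out.size();

    // Global maximum over all partitions keeps the exponentials stable.
    float global_max = max_logit[0];
    for (size_t p = 1; p < p_num; ++p)
        global_max = std::max(global_max, max_logit[p]);

    // Rescale each local sum to the common reference point and accumulate the global sum.
    std::vector<float> corrected(p_num);
    float global_sum = 0.f;
    for (size_t p = 0; p < p_num; ++p) {
        corrected[p] = exp_sum[p] * std::exp(max_logit[p] - global_max);
        global_sum += corrected[p];
    }

    // Re-weight each locally normalized partial output by its corrected share.
    float acc = 0.f;
    for (size_t p = 0; p < p_num; ++p)
        acc += partial_out[p] * corrected[p] / global_sum;
    return acc;
}
```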
+ if (num_of_partitions > 1) { +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); +#else + const uint seq_idx_end = 1; +#endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + // Data layout of tmp_output buf: [batch, heads_num, q_len, partition_idx, head_size] + const uint tmp_out_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + (target_seq_idx + seq_idx) * (num_of_partitions * HEAD_SIZE) + + partition_idx * (HEAD_SIZE) + + head_size_idx; + tmp_out[tmp_out_offset] = acc[seq_idx]; + } + } else { +#if TARGET_SEQ_LEN_BLOCK_SIZE > 1 + const uint seq_idx_end = min(TARGET_SEQ_LEN - target_seq_idx, (uint)TARGET_SEQ_LEN_BLOCK_SIZE); +#else + const uint seq_idx_end = 1; +#endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + const uint output_offset = OUTPUT_GET_INDEX(b0_idx, b1_idx, target_seq_idx + seq_idx, head_size_idx); + + output[output_offset] = acc[seq_idx]; + } + } + } // Gemm2 calculation end +} + +#endif // TARGET_SEQ_LEN_BLOCK_SIZE != 1 + +#endif // SDPA_STAGE_0 + +#ifdef SDPA_STAGE_1 + +// MTL iGPU faces high register pressure issue with a higher number of REG_VERSION_MAX_VALUES_PER_WI. +// To mitigate this, add an additional level of SDPA results processing +// with lower register pressure (REG_VERSION_MAX_VALUES_PER_WI_LOWER). + +#if SOFTMAX_ACCUMULATOR_TYPE_SIZE == 4 +#define REG_VERSION_MAX_VALUES_PER_WI 24 +#define REG_VERSION_MAX_VALUES_PER_WI_LOWER 8 +#elif SOFTMAX_ACCUMULATOR_TYPE_SIZE == 2 +#define REG_VERSION_MAX_VALUES_PER_WI 48 +#define REG_VERSION_MAX_VALUES_PER_WI_LOWER 16 +#else +#error Unexpected SOFTMAX_ACCUMULATOR data type size +#endif + +// query_input [batch, heads_num, q_len, head_size] +// key_input [batch, kv_heads_num, kv_len, head_size] +// value_input [batch, kv_heads_num, kv_len, head_size] +// attn_mask [1, 1, q_len, kv_len] +// output [batch, heads_num, q_len, head_size] +// exp_sums [batch, heads_num, q_len, partition_idx] +// max_logits [batch, heads_num, q_len, partition_idx] +// tmp_out [batch, heads_num, q_len, partition_idx, head_size] + +REQD_SUB_GROUP_SIZE(SUBGROUP_SIZE) +KERNEL(sdpa_opt_finalization_stage)( + OPTIONAL_SHAPE_INFO_ARG + __global OUTPUT_TYPE* output, + const __global SOFTMAX_ACCUMULATOR_TYPE* exp_sums, + const __global SOFTMAX_ACCUMULATOR_TYPE* max_logits, + const __global OUTPUT_TYPE* tmp_out, + const uint num_of_partitions) { + const uint batch_idx = get_global_id(0); + const uint b0_idx = batch_idx / NUM_HEADS; + const uint b1_idx = batch_idx % NUM_HEADS; + const uint target_seq_idx = get_global_id(1); + const uint sglid = get_sub_group_local_id(); + + if (num_of_partitions <= SUBGROUP_SIZE * REG_VERSION_MAX_VALUES_PER_WI_LOWER) { + /* Registers kernel version, can handle up to SEQ_LEN_PARTITION_SIZE(256) * SUBGROUP_SIZE(16) * REG_VERSION_MAX_VALUES_PER_WI_LOWER(8/16) = 32768/65536 tokens */ + SOFTMAX_ACCUMULATOR_TYPE exp_sum[REG_VERSION_MAX_VALUES_PER_WI_LOWER] = {SOFTMAX_ACCUMULATOR_VAL_ZERO}; + SOFTMAX_ACCUMULATOR_TYPE max_logit[REG_VERSION_MAX_VALUES_PER_WI_LOWER] = {SOFTMAX_ACCUMULATOR_VAL_MIN}; + SOFTMAX_ACCUMULATOR_TYPE local_exp_sum = SOFTMAX_ACCUMULATOR_VAL_ZERO; + SOFTMAX_ACCUMULATOR_TYPE local_max_logit = SOFTMAX_ACCUMULATOR_VAL_MIN; + + const uint iters_num = CEIL_DIV(num_of_partitions, SUBGROUP_SIZE); + for (uint i = 0; i < iters_num; i++) { + const uint partition_idx = i * SUBGROUP_SIZE + sglid; + const uint 
exp_sums_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions) + + target_seq_idx * (num_of_partitions) + + partition_idx; + const uint max_logit_offset = exp_sums_offset; + + if (partition_idx < num_of_partitions) { + exp_sum[i] = exp_sums[exp_sums_offset]; + max_logit[i] = max_logits[max_logit_offset]; + local_max_logit = SOFTMAX_ACCUMULATOR_MAX_FUNC(local_max_logit, max_logit[i]); + } + } + + SOFTMAX_ACCUMULATOR_TYPE global_max = sub_group_reduce_max(local_max_logit); + + // Update exp_sum with respect to the global maximum + for (uint i = 0; i < iters_num; i++) { + const uint partition_idx = i * SUBGROUP_SIZE + sglid; + if (partition_idx < num_of_partitions) { + exp_sum[i] = exp_sum[i] * native_exp(max_logit[i] - global_max); + local_exp_sum += exp_sum[i]; + } + } + + SOFTMAX_ACCUMULATOR_TYPE global_sum = sub_group_reduce_add(local_exp_sum); + + for (uint head_size_idx = 0; head_size_idx < HEAD_SIZE / SUBGROUP_SIZE; head_size_idx++) { + SOFTMAX_ACCUMULATOR_TYPE acc = 0.0f; + for (uint partition_idx = 0; partition_idx < num_of_partitions; partition_idx++) { + const uint tmp_out_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + target_seq_idx * (num_of_partitions * HEAD_SIZE) + + partition_idx * (HEAD_SIZE) + + (head_size_idx * SUBGROUP_SIZE + sglid); + OUTPUT_TYPE out_val = tmp_out[tmp_out_offset]; + acc += TO_SOFTMAX_ACCUMULATOR_TYPE(out_val) * + TO_SOFTMAX_ACCUMULATOR_TYPE(sub_group_broadcast(exp_sum[partition_idx / SUBGROUP_SIZE], partition_idx % SUBGROUP_SIZE)) / + TO_SOFTMAX_ACCUMULATOR_TYPE(global_sum); + } + const uint out_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * HEAD_SIZE) + + b1_idx * (TARGET_SEQ_LEN * HEAD_SIZE) + + target_seq_idx * (HEAD_SIZE) + + (head_size_idx * SUBGROUP_SIZE + sglid); + + output[out_offset] = TO_OUTPUT_TYPE(acc); + } + } else if (num_of_partitions <= SUBGROUP_SIZE * REG_VERSION_MAX_VALUES_PER_WI) { + /* Registers kernel version, can handle up to SEQ_LEN_PARTITION_SIZE(256) * SUBGROUP_SIZE(16) * REG_VERSION_MAX_VALUES_PER_WI(24/48) = 98304/196608 tokens */ + SOFTMAX_ACCUMULATOR_TYPE exp_sum[REG_VERSION_MAX_VALUES_PER_WI] = {SOFTMAX_ACCUMULATOR_VAL_ZERO}; + SOFTMAX_ACCUMULATOR_TYPE max_logit[REG_VERSION_MAX_VALUES_PER_WI] = {SOFTMAX_ACCUMULATOR_VAL_MIN}; + SOFTMAX_ACCUMULATOR_TYPE local_exp_sum = SOFTMAX_ACCUMULATOR_VAL_ZERO; + SOFTMAX_ACCUMULATOR_TYPE local_max_logit = SOFTMAX_ACCUMULATOR_VAL_MIN; + + const uint iters_num = CEIL_DIV(num_of_partitions, SUBGROUP_SIZE); + for (uint i = 0; i < iters_num; i++) { + const uint partition_idx = i * SUBGROUP_SIZE + sglid; + const uint exp_sums_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions) + + target_seq_idx * (num_of_partitions) + + partition_idx; + const uint max_logit_offset = exp_sums_offset; + + if (partition_idx < num_of_partitions) { + exp_sum[i] = exp_sums[exp_sums_offset]; + max_logit[i] = max_logits[max_logit_offset]; + local_max_logit = SOFTMAX_ACCUMULATOR_MAX_FUNC(local_max_logit, max_logit[i]); + } + } + + SOFTMAX_ACCUMULATOR_TYPE global_max = sub_group_reduce_max(local_max_logit); + + // Update exp_sum with respect to the global maximum + for (uint i = 0; i < iters_num; i++) { + const uint partition_idx = i * SUBGROUP_SIZE + sglid; + if (partition_idx < num_of_partitions) { + exp_sum[i] = exp_sum[i] * native_exp(max_logit[i] - global_max); + local_exp_sum += exp_sum[i]; + 
} + } + + SOFTMAX_ACCUMULATOR_TYPE global_sum = sub_group_reduce_add(local_exp_sum); + + for (uint head_size_idx = 0; head_size_idx < HEAD_SIZE / SUBGROUP_SIZE; head_size_idx++) { + SOFTMAX_ACCUMULATOR_TYPE acc = 0.0f; + for (uint partition_idx = 0; partition_idx < num_of_partitions; partition_idx++) { + const uint tmp_out_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + target_seq_idx * (num_of_partitions * HEAD_SIZE) + + partition_idx * (HEAD_SIZE) + + (head_size_idx * SUBGROUP_SIZE + sglid); + OUTPUT_TYPE out_val = tmp_out[tmp_out_offset]; + acc += TO_SOFTMAX_ACCUMULATOR_TYPE(out_val) * + TO_SOFTMAX_ACCUMULATOR_TYPE(sub_group_broadcast(exp_sum[partition_idx / SUBGROUP_SIZE], partition_idx % SUBGROUP_SIZE)) / + TO_SOFTMAX_ACCUMULATOR_TYPE(global_sum); + } + const uint out_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * HEAD_SIZE) + + b1_idx * (TARGET_SEQ_LEN * HEAD_SIZE) + + target_seq_idx * (HEAD_SIZE) + + (head_size_idx * SUBGROUP_SIZE + sglid); + + output[out_offset] = TO_OUTPUT_TYPE(acc); + } + } else { + /* Global memory kernel version, can handle any number of tokens, but could be very slow. */ + SOFTMAX_ACCUMULATOR_TYPE local_exp_sum = SOFTMAX_ACCUMULATOR_VAL_ZERO; + SOFTMAX_ACCUMULATOR_TYPE local_max_logit = SOFTMAX_ACCUMULATOR_VAL_MIN; + + const uint iters_num = CEIL_DIV(num_of_partitions, SUBGROUP_SIZE); + for (uint i = 0; i < iters_num; i++) { + const uint partition_idx = i * SUBGROUP_SIZE + sglid; + const uint max_logit_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions) + + target_seq_idx * (num_of_partitions) + + partition_idx; + + + if (partition_idx < num_of_partitions) { + local_max_logit = SOFTMAX_ACCUMULATOR_MAX_FUNC(local_max_logit, max_logits[max_logit_offset]); + } + } + + SOFTMAX_ACCUMULATOR_TYPE global_max = sub_group_reduce_max(local_max_logit); + + // Calculate global sum + for (uint i = 0; i < iters_num; i++) { + const uint partition_idx = i * SUBGROUP_SIZE + sglid; + const uint exp_sums_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions) + + target_seq_idx * (num_of_partitions) + + partition_idx; + const uint max_logit_offset = exp_sums_offset; + + if (partition_idx < num_of_partitions) { + local_exp_sum += exp_sums[exp_sums_offset] * native_exp(max_logits[max_logit_offset] - global_max); + } + } + + SOFTMAX_ACCUMULATOR_TYPE global_sum = sub_group_reduce_add(local_exp_sum); + + for (uint head_size_idx = 0; head_size_idx < HEAD_SIZE / SUBGROUP_SIZE; head_size_idx++) { + SOFTMAX_ACCUMULATOR_TYPE acc = 0.0f; + for (uint partition_idx = 0; partition_idx < num_of_partitions; partition_idx++) { + const uint tmp_out_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions * HEAD_SIZE) + + target_seq_idx * (num_of_partitions * HEAD_SIZE) + + partition_idx * (HEAD_SIZE) + + (head_size_idx * SUBGROUP_SIZE + sglid); + + const uint exp_sums_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions) + + b1_idx * (TARGET_SEQ_LEN * num_of_partitions) + + target_seq_idx * (num_of_partitions) + + partition_idx; + const uint max_logit_offset = exp_sums_offset; + + SOFTMAX_ACCUMULATOR_TYPE new_exp_sum = exp_sums[exp_sums_offset] * native_exp(max_logits[max_logit_offset] - global_max); + + OUTPUT_TYPE out_val = tmp_out[tmp_out_offset]; + acc += TO_SOFTMAX_ACCUMULATOR_TYPE(out_val) 
* new_exp_sum / TO_SOFTMAX_ACCUMULATOR_TYPE(global_sum); + } + + const uint out_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * HEAD_SIZE) + + b1_idx * (TARGET_SEQ_LEN * HEAD_SIZE) + + target_seq_idx * (HEAD_SIZE) + + (head_size_idx * SUBGROUP_SIZE + sglid); + + output[out_offset] = TO_OUTPUT_TYPE(acc); + } + } +} + +#endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl new file mode 100644 index 00000000000000..cd289be026e7e3 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl @@ -0,0 +1,212 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" + +// query_input [batch, heads_num, q_len, head_size] +// key_input [batch, kv_heads_num, kv_len, head_size] +// value_input [batch, kv_heads_num, kv_len, head_size] +// attn_mask [1, 1, q_len, kv_len] +// output [batch, heads_num, q_len, head_size] +// tmp_buf [batch, heads_num, q_len, kv_len] + +// When handling long sequences and executing in FP16, accuracy can significantly vary based on two factors: +// 1) The order of scale application (which can be controlled using the APPLY_SCALE_TO_QUERY macro) +// 2) The type of SoftMax accumulator + +inline uint FUNC(get_input0_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#if INPUT0_SIMPLE + return GET_DATA_INDEX_6D_SAFE(INPUT0, b, f, w, z, y, x); +#else +#if INPUT0_DIMS == 4 + return INPUT0_GET_INDEX_SAFE(b, f, y, x); +#elif INPUT0_DIMS == 5 + return INPUT0_GET_INDEX_SAFE(b, f, z, y, x); +#elif INPUT0_DIMS == 6 + return INPUT0_GET_INDEX_SAFE(b, f, w, z, y, x); +#else +# error sdpa_ref.cl : Unsupported input 0 format +#endif +#endif +} + +inline uint FUNC(get_input0_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef INPUT0_DIMS_ORDER + return FUNC_CALL(get_input0_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR INPUT0_DIMS_ORDER); +#else + return FUNC_CALL(get_input0_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, x); +#endif +} + +inline uint FUNC(get_input1_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef DO_BROADCAST_KEY_VALUE + DO_BROADCAST_KEY_VALUE; +#endif +#if INPUT1_SIMPLE + return GET_DATA_INDEX_6D_SAFE(INPUT1, b, f, w, z, y, x); +#else +#if INPUT1_DIMS == 4 + return INPUT1_GET_INDEX_SAFE(b, f, y, x); +#elif INPUT1_DIMS == 5 + return INPUT1_GET_INDEX_SAFE(b, f, z, y, x); +#elif INPUT1_DIMS == 6 + return INPUT1_GET_INDEX_SAFE(b, f, w, z, y, x); +#else +# error sdpa_ref.cl : Unsupported input 1 format +#endif +#endif +} + +inline uint FUNC(get_input1_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef INPUT1_DIMS_ORDER + return FUNC_CALL(get_input1_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR INPUT1_DIMS_ORDER); +#else + return FUNC_CALL(get_input1_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, x); +#endif +} + +inline uint FUNC(get_input2_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef DO_BROADCAST_KEY_VALUE + DO_BROADCAST_KEY_VALUE; +#endif +#if INPUT2_SIMPLE + return GET_DATA_INDEX_6D_SAFE(INPUT2, b, f, w, z, y, x); +#else +#if INPUT2_DIMS == 4 + return INPUT2_GET_INDEX_SAFE(b, f, y, x); +#elif INPUT2_DIMS == 5 + return INPUT2_GET_INDEX_SAFE(b, f, z, y, x); +#elif INPUT2_DIMS == 6 + return INPUT2_GET_INDEX_SAFE(b, f, w, z, y, x); +#else +# error sdpa_ref.cl : Unsupported input 1 format +#endif 
+#endif +} + +inline uint FUNC(get_input2_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#ifdef INPUT2_DIMS_ORDER + return FUNC_CALL(get_input2_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR INPUT2_DIMS_ORDER); +#else + return FUNC_CALL(get_input2_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, x); +#endif +} + +#define APPLY_SCALE_TO_QUERY 1 + +KERNEL(sdpa_ref)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* query_input, + const __global INPUT1_TYPE* key_input, + const __global INPUT2_TYPE* value_input, +#if HAS_ATTN_MASK_INPUT + const __global INPUT3_TYPE* attn_mask, +#endif +#if HAS_SCALE_INPUT + const __global INPUT4_TYPE* scale, +#endif + __global OUTPUT_TYPE* output, + __global OUTPUT_TYPE* tmp_buf +) +{ + const uint batch_idx = get_global_id(0); + const uint b0 = batch_idx / NUM_HEADS; /* BATCH dim */ + const uint b1 = batch_idx % NUM_HEADS; /* HEADS_NUM dim */ + const uint target_seq_idx = get_global_id(1); + const uint head_size_idx = get_global_id(2); + +#if HAS_SCALE_INPUT + const OUTPUT_TYPE scale_val = *scale; +#else + const OUTPUT_TYPE scale_val = OUTPUT_VAL_ONE / sqrt(TO_OUTPUT_TYPE(INPUT1_SIZE_X)); +#endif + + // Process 1*seq_len elements (Gemm1 + SoftMax) using a single work item, saving results to tmp_buf and + // reusing them between all work items within a single workgroup for Gemm2 calculations. + if (get_local_id(2) == 0) { + for (uint s = 0; s < SOURCE_SEQ_LEN /* seq_len */; s++) { + OUTPUT_TYPE acc = 0; + for (uint h = 0; h < HEAD_SIZE /* head_size */; h++) { + uint query_offset = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b0, b1, 0, 0, target_seq_idx, h); + uint key_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b0, b1, 0, 0, s, h); + +#if APPLY_SCALE_TO_QUERY + INPUT0_TYPE q_val = query_input[query_offset] * scale_val; +#else + INPUT0_TYPE q_val = query_input[query_offset]; +#endif + INPUT1_TYPE k_val = key_input[key_offset]; + acc += q_val * k_val; + } + +#if !APPLY_SCALE_TO_QUERY + acc *= scale_val; +#endif + + uint tmp_buf_offset = b0 * (NUM_HEADS * TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + b1 * (TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + target_seq_idx * (SOURCE_SEQ_LEN) + s; + tmp_buf[tmp_buf_offset] = acc; + } + + ACCUMULATOR_TYPE qk_max = ACCUMULATOR_VAL_MIN; + for (uint s = 0; s < SOURCE_SEQ_LEN /* seq_len */; s++) { + uint tmp_buf_offset = b0 * (NUM_HEADS * TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + b1 * (TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + target_seq_idx * (SOURCE_SEQ_LEN) + s; +#if IS_CAUSAL + OUTPUT_TYPE attn_mask_val = s > target_seq_idx ? 
OUTPUT_VAL_MIN : 0; +#elif !IS_CAUSAL && HAS_ATTN_MASK_INPUT + uint attn_mask_offset = INPUT3_GET_INDEX_SAFE(b0, b1, target_seq_idx, s); + OUTPUT_TYPE attn_mask_val = attn_mask[attn_mask_offset]; +#else + OUTPUT_TYPE attn_mask_val = OUTPUT_VAL_ZERO; +#endif + + OUTPUT_TYPE qk_val = tmp_buf[tmp_buf_offset] + attn_mask_val; + tmp_buf[tmp_buf_offset] = qk_val; + + qk_max = ACCUMULATOR_MAX_FUNC(qk_max, TO_ACCUMULATOR_TYPE(qk_val)); + } + + ACCUMULATOR_TYPE exp_sum = ACCUMULATOR_VAL_ZERO; + for (uint s = 0; s < SOURCE_SEQ_LEN /* seq_len */; s++) { + uint tmp_buf_offset = b0 * (NUM_HEADS * TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + b1 * (TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + target_seq_idx * (SOURCE_SEQ_LEN) + s; + + OUTPUT_TYPE qk_val = tmp_buf[tmp_buf_offset]; + ACCUMULATOR_TYPE val = native_exp(TO_ACCUMULATOR_TYPE(qk_val) - qk_max); + exp_sum += val; + + tmp_buf[tmp_buf_offset] = TO_OUTPUT_TYPE(val); + } + + const ACCUMULATOR_TYPE inv_sum = ACCUMULATOR_VAL_ONE / exp_sum; + for (uint s = 0; s < SOURCE_SEQ_LEN /* seq_len */; s++) { + uint tmp_buf_offset = b0 * (NUM_HEADS * TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + b1 * (TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + target_seq_idx * (SOURCE_SEQ_LEN) + s; + + OUTPUT_TYPE qk_val = tmp_buf[tmp_buf_offset]; + ACCUMULATOR_TYPE val = TO_ACCUMULATOR_TYPE(qk_val) * inv_sum; + tmp_buf[tmp_buf_offset] = TO_OUTPUT_TYPE(val); + } + } + + barrier(CLK_GLOBAL_MEM_FENCE); + + OUTPUT_TYPE acc = 0; + for (uint s = 0; s < SOURCE_SEQ_LEN /* seq_len */; s++) { + uint tmp_buf_offset = b0 * (NUM_HEADS * TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + b1 * (TARGET_SEQ_LEN * SOURCE_SEQ_LEN) + + target_seq_idx * (SOURCE_SEQ_LEN) + s; + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0, b1, 0, 0, s, head_size_idx); + + acc += tmp_buf[tmp_buf_offset] * value_input[value_offset]; + } + + uint output_offset = OUTPUT_GET_INDEX(b0, b1, target_seq_idx, head_size_idx); + output[output_offset] = acc; +} diff --git a/src/plugins/intel_gpu/src/kernel_selector/common_types.h b/src/plugins/intel_gpu/src/kernel_selector/common_types.h index c2a4ef1653472d..768a0fc3c4f854 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/common_types.h +++ b/src/plugins/intel_gpu/src/kernel_selector/common_types.h @@ -59,6 +59,7 @@ enum class KernelType { DEPTH_TO_SPACE, BATCH_TO_SPACE, SHAPE_OF, + SDPA, SHUFFLE_CHANNELS, SLICE, STRIDED_SLICE, diff --git a/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp b/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp index 084ae71e42732c..fcd35d13a3639b 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp @@ -326,8 +326,8 @@ JitDefinitions DataTensorJitConstant::GetDefinitions() const { JitDefinitions baseDefinitions = TensorBaseTJitConstant::GetDefinitions(_tensor); JitDefinitions definitions{}; - DimensionAccessHelper dims(_tensor); - DimensionAccessHelper dims_padded(_tensor, true); + DimensionAccessHelperJit dims(_tensor); + DimensionAccessHelperJit dims_padded(_tensor, true); // shape_info layout // if only y has dynamic padding: // [dim_b, dim_f, dim_v, dim_u, dim_w, dim_z, dim_y, dim_x, pad_before_y, pad_after_y] diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.h index e3e5f3dcc47a2d..2c8256b8551b89 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.h @@ -11,9 +11,9 @@ namespace 
kernel_selector { struct weight_bias_params; struct WeightsReorderParams; -struct DimensionAccessHelper { - explicit DimensionAccessHelper(const DataTensor& t, bool padded = false) { - std::vector dims = { +struct DimensionAccessHelperBase { + explicit DimensionAccessHelperBase(const DataTensor& t) { + dims = { t.Batch(), t.Feature(), t.U(), @@ -23,6 +23,23 @@ struct DimensionAccessHelper { t.Y(), t.X(), }; + } + + Tensor::Dim& x_dim() { return dims[7]; } + Tensor::Dim& y_dim() { return dims[6]; } + Tensor::Dim& z_dim() { return dims[5]; } + Tensor::Dim& w_dim() { return dims[4]; } + Tensor::Dim& v_dim() { return dims[3]; } + Tensor::Dim& u_dim() { return dims[2]; } + Tensor::Dim& f_dim() { return dims[1]; } + Tensor::Dim& b_dim() { return dims[0]; } + + std::vector dims; +}; + +struct DimensionAccessHelperJit : virtual DimensionAccessHelperBase { + explicit DimensionAccessHelperJit(const DataTensor& t, bool padded = false) + : DimensionAccessHelperBase(t) { size_t dyn_shape_offset = t.get_dynamic_shape_offset(); size_t dyn_pad_offset = dyn_shape_offset + DataTensor::max_rank(); for (auto d : dims) { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/arg_max_min/arg_max_min_kernel_axis.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/arg_max_min/arg_max_min_kernel_axis.cpp index 2d878e4a9f28e1..ecb6be6f17020d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/arg_max_min/arg_max_min_kernel_axis.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/arg_max_min/arg_max_min_kernel_axis.cpp @@ -22,7 +22,7 @@ size_t getOperationNumber(const arg_max_min_params& params) { std::string getOperationNumberString(const arg_max_min_params& params) { const auto& output = params.outputs[0]; - DimensionAccessHelper dims(output); + DimensionAccessHelperJit dims(output); switch (params.argMaxMinAxis) { case ArgMaxMinAxis::BATCH: return toVectorMulString({dims.x(), dims.y(), dims.z(), dims.f()}); case ArgMaxMinAxis::FEATURE: return toVectorMulString({dims.x(), dims.y(), dims.z(), dims.b()}); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_base.cpp index 07734e85b9dd4a..cdd8d7fc56e39e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_base.cpp @@ -15,7 +15,7 @@ JitConstants FullyConnectedKernelBase::GetJitConstants(const fully_connected_par JitConstants jit = WeightBiasKernelBase::GetJitConstants(params); const auto& input = params.inputs[0]; if (input.is_dynamic()) { - DimensionAccessHelper dims(input); + DimensionAccessHelperJit dims(input); jit.AddConstant(MakeJitConstant("INPUT0_ELEMENTS_COUNT", toVectorMulString({dims.x(), dims.y(), dims.z(), dims.w(), dims.f()}))); } else { const auto x_size = input.LogicalSize() / input.Batch().v; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp index 2e804085939732..e59f424e5d6af7 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp @@ -135,10 +135,10 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons 
jit.Merge(MakeTypeJitConstants(params.inputs[0].GetDType(), "ACCUMULATOR")); if (params.has_dynamic_tensors()) { - DimensionAccessHelper dims0(params.inputs[0]); - DimensionAccessHelper dims1(params.inputs[1]); - DimensionAccessHelper dims0_padded(params.inputs[0], true); - DimensionAccessHelper dims1_padded(params.inputs[1], true); + DimensionAccessHelperJit dims0(params.inputs[0]); + DimensionAccessHelperJit dims1(params.inputs[1]); + DimensionAccessHelperJit dims0_padded(params.inputs[0], true); + DimensionAccessHelperJit dims1_padded(params.inputs[1], true); // Note: Actually currently this kernel is not being selected if it is shape agnostic impl && transposed inputs // Because we cannot get the original rank auto input0_dims = ConvTo8dims(params.input0_order); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/mvn/mvn_kernel_bfyx_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/mvn/mvn_kernel_bfyx_opt.cpp index 806bb90ba67b43..923bd98814a46f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/mvn/mvn_kernel_bfyx_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/mvn/mvn_kernel_bfyx_opt.cpp @@ -80,7 +80,7 @@ JitConstants MVNKernelBfyxOpt::GetJitConstants(const mvn_params& params, MVNKern if (params.has_dynamic_tensors()) { const auto& input = params.inputs[0]; - DimensionAccessHelper dims(input); + DimensionAccessHelperJit dims(input); std::string data_set_size; std::string data_set_count; if (params.mvnMode == MVNMode::WITHIN_CHANNELS) { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/non_zero/count_nonzero_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/non_zero/count_nonzero_kernel_ref.cpp index d3132e4357fa07..7e6c1397b988e4 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/non_zero/count_nonzero_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/non_zero/count_nonzero_kernel_ref.cpp @@ -76,7 +76,7 @@ KernelsData CountNonzeroKernelRef::GetKernelsData(const Params& params) const { auto cldnn_jit = MakeBaseParamsJitConstants(newParams); if (newParams.has_dynamic_tensors()) { const auto& input = newParams.inputs[0]; - DimensionAccessHelper dims(input); + DimensionAccessHelperJit dims(input); const std::string total_data_size = toVectorMulString({dims.x(), dims.y(), dims.z(), dims.w(), dims.f(), dims.b()}); cldnn_jit.AddConstants({MakeJitConstant("DATA_SIZE", total_data_size)}); } else { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/non_zero/gather_nonzero_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/non_zero/gather_nonzero_kernel_ref.cpp index 0672566e0ed2ad..bac2237893bef3 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/non_zero/gather_nonzero_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/non_zero/gather_nonzero_kernel_ref.cpp @@ -46,7 +46,7 @@ JitConstants GatherNonzeroKernelRef::GetJitConstants(const gather_nonzero_params jit.AddConstant(MakeJitConstant("MAX_LOCAL_MEM_SIZE", max_local_mem_size)); if (input.is_dynamic()) { - DimensionAccessHelper dims(input); + DimensionAccessHelperJit dims(input); const std::string total_data_size = toVectorMulString({dims.x(), dims.y(), dims.z(), dims.w(), dims.f(), dims.b()}); jit.AddConstant(MakeJitConstant("TOTAL_DATA_SIZE", total_data_size)); } else { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_tile_8x8_4x4.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_tile_8x8_4x4.cpp index 
13eb399ef8ef4d..06ee5a2bc4b6ef 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_tile_8x8_4x4.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_tile_8x8_4x4.cpp @@ -107,7 +107,7 @@ static inline std::string GetTiledOutputOrder(const permute_params& params) { std::string out_z_str = ""; const auto& output = params.outputs[0]; if (params.has_dynamic_outputs()) { - DimensionAccessHelper dims(output); + DimensionAccessHelperJit dims(output); out_y_str = dims.y(); out_z_str = dims.z(); } else { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/reduce/reduce_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/reduce/reduce_kernel_base.cpp index 80e16939bab248..318daac3b5b30e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/reduce/reduce_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/reduce/reduce_kernel_base.cpp @@ -30,7 +30,7 @@ JitConstants ReduceKernelBase::GetJitConstants(const reduce_params& params) cons const auto& output = params.outputs[0]; if (output.is_dynamic()) { - DimensionAccessHelper dims(output); + DimensionAccessHelperJit dims(output); jit.AddConstant(MakeJitConstant("COMPUTATIONAL_OPERATIONS_NUMBER", toVectorMulString({dims.x(), dims.y(), dims.z(), diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/rms/rms_kernel_bfyx_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/rms/rms_kernel_bfyx_opt.cpp index db5e8c6beb1588..15043ef2624053 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/rms/rms_kernel_bfyx_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/rms/rms_kernel_bfyx_opt.cpp @@ -30,7 +30,7 @@ JitConstants RMSKernelBfyxOpt::GetJitConstants(const rms_params& params, Dispatc if (params.has_dynamic_tensors()) { const auto& input = params.inputs[0]; - DimensionAccessHelper dims(input); + DimensionAccessHelperJit dims(input); std::string data_size; switch (params.ov_input_rank) { case 1 : diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp new file mode 100644 index 00000000000000..61028ef5348a1a --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp @@ -0,0 +1,125 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "sdpa_kernel_base.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + +static std::string GetDimsOrder(const std::vector& order_idx) { + auto get_order_idx = [](std::vector order_idx, int64_t dim_idx) { + int loc = 0; + for (auto idx : order_idx) { + if (idx == dim_idx) + break; + loc += 1; + } + return loc; + }; + + std::string dims_order = ""; + if (order_idx.size() == 2) { + const std::vector dims2 = {"y", "x"}; + dims_order = "b,f,w,z," + + dims2[get_order_idx(order_idx, 0)] + "," + dims2[get_order_idx(order_idx, 1)]; + } else if (order_idx.size() == 3) { + const std::vector dims3 = {"f", "y", "x"}; + dims_order = "b," + dims3[get_order_idx(order_idx, 0)] + ",w,z," + + dims3[get_order_idx(order_idx, 1)] + "," + dims3[get_order_idx(order_idx, 2)]; + } else if (order_idx.size() == 4) { + const std::vector dims4 = {"b", "f", "y", "x"}; + dims_order = dims4[get_order_idx(order_idx, 0)] + "," + dims4[get_order_idx(order_idx, 1)] + ",w,z," + + dims4[get_order_idx(order_idx, 2)] + "," + dims4[get_order_idx(order_idx, 3)]; + } else if (order_idx.size() == 5) { + const 
std::vector dims5 = {"b", "f", "z", "y", "x"}; + dims_order = dims5[get_order_idx(order_idx, 0)] + "," + dims5[get_order_idx(order_idx, 1)] + ",w," + + dims5[get_order_idx(order_idx, 2)] + "," + dims5[get_order_idx(order_idx, 3)] + "," + + dims5[get_order_idx(order_idx, 4)]; + } else if (order_idx.size() == 6) { + const std::vector dims6 = {"b", "f", "w", "z", "y", "x"}; + dims_order = dims6[get_order_idx(order_idx, 0)] + "," + dims6[get_order_idx(order_idx, 1)] + "," + + dims6[get_order_idx(order_idx, 2)] + "," + dims6[get_order_idx(order_idx, 3)] + "," + + dims6[get_order_idx(order_idx, 4)] + "," + dims6[get_order_idx(order_idx, 5)]; + } else { + dims_order = "b,f,w,z,y,x"; + } + return dims_order; +} + +static std::string GetBroadcastInputStr(const size_t input_rank, const int64_t axes, const int64_t val) { + std::vector dims; + if (input_rank == 1) { + dims = {"x"}; + } else if (input_rank == 2) { + dims = {"y", "x"}; + } else if (input_rank == 3) { + dims = {"f", "y", "x"}; + } else if (input_rank == 4) { + dims = {"b", "f", "y", "x"}; + } else if (input_rank == 5) { + dims = {"b", "f", "z", "y", "x"}; + } else if (input_rank == 6) { + dims = {"b", "f", "w", "z", "y", "x"}; + } + return dims[axes] + " /= " + std::to_string(val) + ";"; +} + +JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const { + auto jit = MakeBaseParamsJitConstants(params); + + if (params.conf.broadcast_axis != -1) { + jit.AddConstant(MakeJitConstant("DO_BROADCAST_KEY_VALUE", GetBroadcastInputStr(params.inputs[0].GetDims().size(), + params.conf.broadcast_axis, + params.conf.group_size))); + } + + jit.AddConstant(MakeJitConstant("IS_CAUSAL", params.conf.is_causal)); + jit.AddConstant(MakeJitConstant("HAS_ATTN_MASK_INPUT", params.inputs.size() > 3)); + jit.AddConstant(MakeJitConstant("HAS_SCALE_INPUT", params.inputs.size() > 4)); + + auto is_default_order = [](const std::vector& order) { + for (size_t i = 0; i < order.size(); i++) + if (order[i] != static_cast(i)) + return false; + return true; + }; + + if ((!params.input0_order.empty() && !is_default_order(params.input0_order)) || params.conf.broadcast_axis != -1) { + jit.AddConstant(MakeJitConstant("INPUT0_DIMS_ORDER", GetDimsOrder(params.input0_order))); + } + if ((!params.input1_order.empty() && !is_default_order(params.input1_order)) || params.conf.broadcast_axis != -1) { + jit.AddConstant(MakeJitConstant("INPUT1_DIMS_ORDER", GetDimsOrder(params.input1_order))); + } + if ((!params.input2_order.empty() && !is_default_order(params.input2_order)) || params.conf.broadcast_axis != -1) { + jit.AddConstant(MakeJitConstant("INPUT2_DIMS_ORDER", GetDimsOrder(params.input2_order))); + } + + TransposedDimensionAccessHelperJit dims_q(params.inputs[0], params.input0_order); + jit.AddConstant(MakeJitConstant("TARGET_SEQ_LEN", dims_q.y())); + jit.AddConstant(MakeJitConstant("NUM_HEADS", dims_q.f())); + + TransposedDimensionAccessHelperJit dims_k(params.inputs[1], params.input1_order); + jit.AddConstant(MakeJitConstant("SOURCE_SEQ_LEN", dims_k.y())); + + return jit; +} + +bool SDPAKernelBase::Validate(const Params& p) const { + if (p.GetType() != KernelType::SDPA) { + return false; + } + + const sdpa_params& params = static_cast(p); + + for (size_t i = 0; i < params.inputs.size(); i++) { + if (params.inputs[i].Dimentions() != 4) + return false; + } + + if (params.outputs[0].Dimentions() != 4) + return false; + + return true; +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h 
b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h new file mode 100644 index 00000000000000..1d4f30512df06b --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h @@ -0,0 +1,124 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_base_opencl.h" +#include "kernel_selector_params.h" +#include "kernel_selector_utils.h" +#include + +namespace kernel_selector { +struct TransposedDimensionAccessHelperBase : virtual DimensionAccessHelperBase { + explicit TransposedDimensionAccessHelperBase(const DataTensor& t, std::vector order) + : DimensionAccessHelperBase(t) { + size_t total_dims_count = dims.size(); + size_t new_axis_count = total_dims_count - order.size(); + + transposed_order.resize(total_dims_count); + std::iota(transposed_order.begin(), transposed_order.end(), 0); + for (size_t i = 0; i < order.size(); i++) { + size_t transposed_order_pos = i < 2 ? i : i + new_axis_count; + transposed_order[transposed_order_pos] = order[i] < 2 ? order[i] : order[i] + new_axis_count; + } + } + + Tensor::Dim& x_dim() { return dims[transposed_order[7]]; } + Tensor::Dim& y_dim() { return dims[transposed_order[6]]; } + Tensor::Dim& z_dim() { return dims[transposed_order[5]]; } + Tensor::Dim& w_dim() { return dims[transposed_order[4]]; } + Tensor::Dim& v_dim() { return dims[transposed_order[3]]; } + Tensor::Dim& u_dim() { return dims[transposed_order[2]]; } + Tensor::Dim& f_dim() { return dims[transposed_order[1]]; } + Tensor::Dim& b_dim() { return dims[transposed_order[0]]; } + + std::vector transposed_order; +}; + +struct TransposedDimensionAccessHelperJit : DimensionAccessHelperJit, TransposedDimensionAccessHelperBase { + explicit TransposedDimensionAccessHelperJit(const DataTensor& t, std::vector order, bool padded = false) + : DimensionAccessHelperBase(t) + , DimensionAccessHelperJit(t, padded) + , TransposedDimensionAccessHelperBase(t, order) {} + + std::string x() { return dims_sizes[transposed_order[7]]; } + std::string y() { return dims_sizes[transposed_order[6]]; } + std::string z() { return dims_sizes[transposed_order[5]]; } + std::string w() { return dims_sizes[transposed_order[4]]; } + std::string v() { return dims_sizes[transposed_order[3]]; } + std::string u() { return dims_sizes[transposed_order[2]]; } + std::string f() { return dims_sizes[transposed_order[1]]; } + std::string b() { return dims_sizes[transposed_order[0]]; } + + std::pair x_pad() { + return {pad_before_after_sizes[(transposed_order[7] * 2) + 0], pad_before_after_sizes[(transposed_order[7] * 2) + 1]}; + } + std::pair y_pad() { + return {pad_before_after_sizes[(transposed_order[6] * 2) + 0], pad_before_after_sizes[(transposed_order[6] * 2) + 1]}; + } + std::pair z_pad() { + return {pad_before_after_sizes[(transposed_order[5] * 2) + 0], pad_before_after_sizes[(transposed_order[5] * 2) + 1]}; + } + std::pair w_pad() { + return {pad_before_after_sizes[(transposed_order[4] * 2) + 0], pad_before_after_sizes[(transposed_order[4] * 2) + 1]}; + } + std::pair v_pad() { + return {pad_before_after_sizes[(transposed_order[3] * 2) + 0], pad_before_after_sizes[(transposed_order[3] * 2) + 1]}; + } + std::pair u_pad() { + return {pad_before_after_sizes[(transposed_order[2] * 2) + 0], pad_before_after_sizes[(transposed_order[2] * 2) + 1]}; + } + std::pair f_pad() { + return {pad_before_after_sizes[(transposed_order[1] * 2) + 0], pad_before_after_sizes[(transposed_order[1] * 2) + 1]}; + } + std::pair b_pad() { + 
return {pad_before_after_sizes[(transposed_order[0] * 2) + 0], pad_before_after_sizes[(transposed_order[0] * 2) + 1]}; + } +}; + +struct sdpa_configuration { + int64_t head_size = -1; + int64_t heads_num = -1; + int64_t kv_heads_num = -1; + + // GQA configuration + int64_t group_size = -1; + int64_t broadcast_axis = -1; + + bool is_causal = false; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// sdpa_params +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct sdpa_params : public base_params { + sdpa_params() : base_params(KernelType::SDPA) {} + + std::vector input0_order; + std::vector input1_order; + std::vector input2_order; + std::vector output_order; + + sdpa_configuration conf; +}; + +struct sdpa_fuse_params : fuse_params { + sdpa_fuse_params() : fuse_params(KernelType::SDPA) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// SDPAKernelBase +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +class SDPAKernelBase : public KernelBaseOpenCL { +public: + using KernelBaseOpenCL::KernelBaseOpenCL; + virtual ~SDPAKernelBase() {} + + struct DispatchData : public CommonDispatchData {}; + +protected: + bool Validate(const Params& p) const override; + JitConstants GetJitConstants(const sdpa_params& params) const; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp new file mode 100644 index 00000000000000..581565874f7fbb --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp @@ -0,0 +1,258 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "sdpa_kernel_opt.h" +#include "kernel_selector_utils.h" +#include +#include + +namespace kernel_selector { + +constexpr size_t subgroup_size = 16; + +enum KernelsTypes { + SINGLE_TOKEN = 0, + MULTI_TOKENS, + FINALIZATION, + TOTAL_KERNELS_NUM +}; + +static size_t get_target_seq_len_block_size() { + const size_t block_size = 16; + return block_size; +} + + +static size_t get_seq_len_partition_size() { + const size_t seq_len = 256; + return seq_len; +} + +ParamsKey SDPAKernelOpt::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + + k.EnableDifferentTypes(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDynamicShapesSupport(); + + return k; +} + +bool SDPAKernelOpt::Validate(const Params& p) const { + if (!Parent::Validate(p)) + return false; + + const sdpa_params& params = static_cast(p); + + if (params.conf.head_size < 1 || params.conf.head_size % subgroup_size != 0) + return false; + + return true; +} + +JitConstants SDPAKernelOpt::GetJitConstants(const sdpa_params& params, size_t kernel_idx) const { + auto jit = SDPAKernelBase::GetJitConstants(params); + + const auto softmax_acc_dt = params.inputs[0].GetDType(); + jit.Merge(MakeTypeJitConstants(softmax_acc_dt, "SOFTMAX_ACCUMULATOR")); + + const auto& config = params.conf; + 
jit.AddConstant(MakeJitConstant("SUBGROUP_SIZE", subgroup_size)); + jit.AddConstant(MakeJitConstant("HEAD_SIZE", config.head_size)); + jit.AddConstant(MakeJitConstant("SEQ_LEN_PARTITION_SIZE", get_seq_len_partition_size())); + + auto target_seq_len_block_size = kernel_idx == KernelsTypes::SINGLE_TOKEN ? 1 : get_target_seq_len_block_size(); + jit.AddConstant(MakeJitConstant("TARGET_SEQ_LEN_BLOCK_SIZE", target_seq_len_block_size)); + + auto sdpa_stage = kernel_idx == KernelsTypes::FINALIZATION ? 1 : 0; + jit.AddConstant(MakeJitConstant("SDPA_STAGE_" + std::to_string(sdpa_stage), 1)); + + return jit; +} + +CommonDispatchData SDPAKernelOpt::SetDefault(const sdpa_params& params, size_t kernel_idx) const { + CommonDispatchData dispatch_data; + + const auto& query_input = params.inputs[0]; + + if (!query_input.is_dynamic()) { + TransposedDimensionAccessHelperBase dims_q(params.inputs[0], params.input0_order); + TransposedDimensionAccessHelperBase dims_k(params.inputs[1], params.input1_order); + TransposedDimensionAccessHelperBase output(params.outputs[0], params.output_order); + + const size_t batch_size = output.b_dim().v; + const size_t heads_num = output.f_dim().v; + const size_t source_seq_len = dims_k.y_dim().v; + const size_t target_seq_len = dims_q.y_dim().v; + const size_t head_size = static_cast(params.conf.head_size); + const size_t num_of_partitions = CeilDiv(source_seq_len, get_seq_len_partition_size()); + const size_t target_seq_len_block_size = kernel_idx == 1 ? get_target_seq_len_block_size() : 1; + + if (kernel_idx == KernelsTypes::SINGLE_TOKEN || kernel_idx == KernelsTypes::MULTI_TOKENS) { + dispatch_data.gws = { batch_size * heads_num, + CeilDiv(target_seq_len, target_seq_len_block_size), + head_size * num_of_partitions }; + dispatch_data.lws = { 1, 1, head_size }; + } else if (kernel_idx == 2) { + dispatch_data.gws = { batch_size * heads_num, + target_seq_len, + 16 }; + dispatch_data.lws = { 1, 1, 16 }; + } + } + + return dispatch_data; +} + +KernelsData SDPAKernelOpt::GetKernelsData(const Params& params) const { + if (!Validate(params)) { + return {}; + } + + // Implementation contains multiple kernels: + // kernel[0] - single token generation stage (2nd token) + // kernel[1] - multi tokens processing stage (1st token) + // kernel[2] - results aggregation + + const size_t kernels_num = KernelsTypes::TOTAL_KERNELS_NUM; + KernelData kd = KernelData::Default(params, kernels_num); + kd.needs_sub_kernels_sync = true; + + GetUpdateDispatchDataFunc(kd); + + const auto& prim_params = dynamic_cast(params); + for (size_t kernel_idx = 0; kernel_idx < kernels_num; kernel_idx++) { + auto dispatch_data = SetDefault(prim_params, kernel_idx); + auto kernel_name = kernel_idx == 0 ? kernelName + "_single_token" : + kernel_idx == 1 ? kernelName + "_multi_tokens" : kernelName + "_finalization"; + auto entry_point = GetEntryPoint(kernel_name, prim_params.layerID, params); + auto jit_constants = GetJitConstants(prim_params, kernel_idx); + auto jit = CreateJit(kernel_name, jit_constants, entry_point); + + auto& kernel = kd.kernels[kernel_idx]; + + auto inputs_num = + kernel_idx == KernelsTypes::FINALIZATION ? 
0 : static_cast(prim_params.inputs.size()); + + FillCLKernelData(kernel, + dispatch_data, + params.engineInfo, + kernelName, + jit, + entry_point, + {}, + false, + false, + inputs_num, + GetFusedPrimitiveInputsCount(params), + static_cast(prim_params.outputs.size()), + prim_params.is_shape_agnostic); + + const auto num_of_partitions = 1; + auto& output = prim_params.outputs[0]; + auto head_size = output.X().v; + + auto buf_dt_size = 4; + auto buf_elements_count = (num_of_partitions == 1) ? 1 : output.LogicalSize() / head_size * num_of_partitions; + auto buf_size = buf_elements_count * buf_dt_size; + + auto tmp_out_dt_size = 4; + auto tmp_out_elements_count = (num_of_partitions == 1) ? 1 : output.LogicalSize() * num_of_partitions; + auto tmp_out_size = tmp_out_elements_count * tmp_out_dt_size; + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 2}); + + kd.internalBufferSizes.clear(); + kd.internalBufferSizes.push_back(buf_size); + kd.internalBufferSizes.push_back(buf_size); + kd.internalBufferSizes.push_back(tmp_out_size); + kd.internalBufferDataType = prim_params.inputs[0].GetDType(); + + if (kernel_idx == KernelsTypes::FINALIZATION) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0}); + + ScalarDescriptor num_of_partitions_scalar; + num_of_partitions_scalar.t = ScalarDescriptor::Types::UINT32; + num_of_partitions_scalar.v.u32 = num_of_partitions; + + kernel.params.scalars.clear(); + kernel.params.scalars.push_back(num_of_partitions_scalar); + } + } + + return { kd }; +} + +void SDPAKernelOpt::GetUpdateDispatchDataFunc(KernelData& kd) const { + kd.update_dispatch_data_func = [this](const Params& params, KernelData& kernel_data) { + const auto& prim_params = static_cast(params); + + const size_t expected_kernels_num = KernelsTypes::TOTAL_KERNELS_NUM; + OPENVINO_ASSERT(kernel_data.kernels.size() == expected_kernels_num, + "[GPU] Invalid kernels size for update dispatch data func of SDPA kernel"); + + TransposedDimensionAccessHelperBase dims_q(prim_params.inputs[0], prim_params.input0_order); + TransposedDimensionAccessHelperBase dims_k(prim_params.inputs[1], prim_params.input1_order); + auto& output = prim_params.outputs[0]; + + auto target_seq_len = dims_q.y_dim().v; + auto head_size = dims_q.x_dim().v; + auto source_seq_len = dims_k.y_dim().v; + + auto num_of_partitions = CeilDiv(source_seq_len, get_seq_len_partition_size()); + + auto buf_dt_size = output.ElementSize(); + auto buf_elements_count = (num_of_partitions == 1) ? 1 : output.LogicalSize() / head_size * num_of_partitions; + auto buf_size = buf_elements_count * buf_dt_size; + + auto tmp_out_dt_size = output.ElementSize(); + auto tmp_out_elements_count = (num_of_partitions == 1) ? 
1 : output.LogicalSize() * num_of_partitions; + auto tmp_out_size = tmp_out_elements_count * tmp_out_dt_size; + + auto dispatch_data1 = SetDefault(prim_params, 0); + kernel_data.kernels[0].params.workGroups.global = dispatch_data1.gws; + kernel_data.kernels[0].params.workGroups.local = dispatch_data1.lws; + kernel_data.kernels[0].skip_execution = target_seq_len > 1; + + auto dispatch_data2 = SetDefault(prim_params, 1); + kernel_data.kernels[1].params.workGroups.global = dispatch_data2.gws; + kernel_data.kernels[1].params.workGroups.local = dispatch_data2.lws; + kernel_data.kernels[1].skip_execution = target_seq_len == 1; + + ScalarDescriptor num_of_partitions_scalar; + num_of_partitions_scalar.t = ScalarDescriptor::Types::UINT32; + num_of_partitions_scalar.v.u32 = static_cast(num_of_partitions); + + auto dispatch_data3 = SetDefault(prim_params, 2); + kernel_data.kernels[2].params.workGroups.global = dispatch_data3.gws; + kernel_data.kernels[2].params.workGroups.local = dispatch_data3.lws; + kernel_data.kernels[2].skip_execution = num_of_partitions == 1; + + kernel_data.kernels[2].params.scalars.clear(); + kernel_data.kernels[2].params.scalars.push_back(num_of_partitions_scalar); + + kernel_data.internalBufferSizes.clear(); + kernel_data.internalBufferSizes.push_back(buf_size); + kernel_data.internalBufferSizes.push_back(buf_size); + kernel_data.internalBufferSizes.push_back(tmp_out_size); + kernel_data.internalBufferDataType = prim_params.inputs[0].GetDType(); + }; +} + +KernelsPriority SDPAKernelOpt::GetKernelsPriority(const Params& /*params*/) const { + return FORCE_PRIORITY_1; +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.h new file mode 100644 index 00000000000000..8d7279f5546112 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.h @@ -0,0 +1,29 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "sdpa_kernel_base.h" + +namespace kernel_selector { +class SDPAKernelOpt : public SDPAKernelBase { +public: + using Parent = SDPAKernelBase; + SDPAKernelOpt() : SDPAKernelBase("sdpa_opt") {} + virtual ~SDPAKernelOpt() {} + + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + ParamsKey GetSupportedKey() const override; + +protected: + bool Validate(const Params& p) const override; + void GetUpdateDispatchDataFunc(KernelData& kd) const override; + CommonDispatchData SetDefault(const sdpa_params& params, size_t kernel_idx) const; + JitConstants GetJitConstants(const sdpa_params& params, size_t kernel_idx) const; + std::vector GetSupportedFusedOps() const override { + return {}; + } +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp new file mode 100644 index 00000000000000..a80f3c31dfc8f3 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp @@ -0,0 +1,110 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "sdpa_kernel_ref.h" +#include "kernel_selector_utils.h" +#include +#include + +namespace kernel_selector { + +ParamsKey SDPAKernelRef::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + 
k.EnableInputDataType(Datatype::F32); + + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + + k.EnableDifferentTypes(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDynamicShapesSupport(); + + return k; +} + +JitConstants SDPAKernelRef::GetJitConstants(const sdpa_params& params) const { + auto jit = SDPAKernelBase::GetJitConstants(params); + + auto acc_dt = params.inputs[0].GetDType(); + jit.Merge(MakeTypeJitConstants(acc_dt, "ACCUMULATOR")); + + TransposedDimensionAccessHelperJit dims_q(params.inputs[0], params.input0_order); + jit.AddConstant(MakeJitConstant("HEAD_SIZE", dims_q.x())); + + return jit; +} + +CommonDispatchData SDPAKernelRef::SetDefault(const sdpa_params& params) const { + CommonDispatchData dispatchData; + + const auto& output = params.outputs[0]; + dispatchData.gws = { output.Batch().v * output.Feature().v, output.Y().v, output.X().v }; + dispatchData.lws = { 1, 1, output.X().v }; + + return dispatchData; +} + +KernelsData SDPAKernelRef::GetKernelsData(const Params& params) const { + KernelData kd = KernelData::Default<sdpa_params>(params); + const auto& prim_params = dynamic_cast<const sdpa_params&>(params); + + if (!Validate(params)) { + return {}; + } + + auto dispatchData = SetDefault(prim_params); + auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, params); + auto cldnn_jit = GetJitConstants(prim_params); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); + + auto& kernel = kd.kernels[0]; + + GetUpdateDispatchDataFunc(kd); + + FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point, + "", false, false, static_cast<int>(prim_params.inputs.size()), + GetFusedPrimitiveInputsCount(params), 1, prim_params.is_shape_agnostic); + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + + kd.internalBufferSizes.clear(); + kd.internalBufferSizes.push_back(prim_params.inputs[0].ElementSize()); + kd.internalBufferDataType = prim_params.inputs[0].GetDType(); + + return { kd }; +} + +void SDPAKernelRef::GetUpdateDispatchDataFunc(KernelData& kd) const { + kd.update_dispatch_data_func = [this](const Params& params, KernelData& kernel_data) { + const auto& prim_params = static_cast<const sdpa_params&>(params); + auto dispatchData = SetDefault(prim_params); + OPENVINO_ASSERT(kernel_data.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func"); + kernel_data.kernels[0].params.workGroups.global = dispatchData.gws; + kernel_data.kernels[0].params.workGroups.local = dispatchData.lws; + kernel_data.kernels[0].skip_execution = KernelData::SkipKernelExecution(prim_params); + + auto& in_q = prim_params.inputs[0]; + auto& in_k = prim_params.inputs[1]; + TransposedDimensionAccessHelperBase dims_q(in_q, prim_params.input0_order); + TransposedDimensionAccessHelperBase dims_k(in_k, prim_params.input1_order); + + auto elem_size = in_q.ElementSize(); + auto batch_size = in_q.LogicalSize() / dims_q.x_dim().v / dims_q.y_dim().v; + kernel_data.internalBufferSizes.clear(); + kernel_data.internalBufferSizes.push_back(batch_size * dims_q.y_dim().v * dims_k.y_dim().v * elem_size); + + kernel_data.internalBufferDataType = in_q.GetDType(); + }; +} + +KernelsPriority SDPAKernelRef::GetKernelsPriority(const Params& /*params*/) const { + return DONT_USE_IF_HAVE_SOMETHING_ELSE; +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.h
b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.h new file mode 100644 index 00000000000000..c570f32cc1e94e --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.h @@ -0,0 +1,28 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "sdpa_kernel_base.h" + +namespace kernel_selector { +class SDPAKernelRef : public SDPAKernelBase { +public: + using Parent = SDPAKernelBase; + SDPAKernelRef() : SDPAKernelBase("sdpa_ref") {} + virtual ~SDPAKernelRef() {} + + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + ParamsKey GetSupportedKey() const override; + +protected: + void GetUpdateDispatchDataFunc(KernelData& kd) const override; + CommonDispatchData SetDefault(const sdpa_params& params) const; + JitConstants GetJitConstants(const sdpa_params& params) const; + std::vector<FusedOpType> GetSupportedFusedOps() const override { + return {}; + } +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp new file mode 100644 index 00000000000000..b58f04f23e2643 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp @@ -0,0 +1,19 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "sdpa_kernel_selector.h" +#include "sdpa_kernel_ref.h" +#include "sdpa_kernel_opt.h" + +namespace kernel_selector { + +sdpa_kernel_selector::sdpa_kernel_selector() { + Attach<SDPAKernelRef>(); + Attach<SDPAKernelOpt>(); +} + +KernelsData sdpa_kernel_selector::GetBestKernels(const Params& params) const { + return GetNaiveBestKernel(params, KernelType::SDPA); +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.h new file mode 100644 index 00000000000000..e4a5f245bfe18b --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.h @@ -0,0 +1,23 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector { +class sdpa_kernel_selector : public kernel_selector_base { +public: + static sdpa_kernel_selector& Instance() { + static sdpa_kernel_selector instance_; + return instance_; + } + + sdpa_kernel_selector(); + + virtual ~sdpa_kernel_selector() {} + + KernelsData GetBestKernels(const Params& params) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/slice/slice_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/slice/slice_kernel_ref.cpp index ee6f39c3c3c71e..34279dd7de148c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/slice/slice_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/slice/slice_kernel_ref.cpp @@ -122,7 +122,7 @@ JitConstants SliceKernelRef::GetJitConstants(const slice_params& params) const { // Define axes size as constant: if (params.compile_time_axes.empty()) { - kernel_selector::DimensionAccessHelper dims(params.inputs.back()); + kernel_selector::DimensionAccessHelperJit dims(params.inputs.back()); jit.AddConstant(MakeJitConstant(JIT_AXES_BUFF_SIZE_NAME, toVectorMulString({dims.b(), dims.f(),
dims.x(), dims.y(), dims.z()}))); } else { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.cpp index 335c2bc1017303..338ed8d3fb1077 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.cpp @@ -115,7 +115,7 @@ JitConstants SoftmaxKernel_bf::GetJitConstants(const softmax_params& params, Dis if (params.has_dynamic_tensors()) { const auto& input = params.inputs[0]; - DimensionAccessHelper dims(input); + DimensionAccessHelperJit dims(input); auto softmax_dim_y_bfyx = (params.dim == SoftmaxDim::Y && input.GetLayout() == DataLayout::bfyx); auto softmax_dim_x_bfyx = (params.dim == SoftmaxDim::X && input.GetLayout() == DataLayout::bfyx); const std::string lws_0 = "get_local_size(0)"; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/unique/unique_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/unique/unique_kernel_ref.cpp index 5aafdd309ae6d0..5d20503919241b 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/unique/unique_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/unique/unique_kernel_ref.cpp @@ -216,7 +216,7 @@ JitConstants UniqueCountKernelRef::GetJitConstants(const unique_count_params& ke } if (input.is_dynamic()) { - DimensionAccessHelper dims(input); + DimensionAccessHelperJit dims(input); const std::string total_data_size = toVectorMulString({dims.x(), dims.y(), dims.z(), dims.w(), dims.f(), dims.b()}); jit_constants.AddConstant(MakeJitConstant("TOTAL_DATA_SIZE", total_data_size)); @@ -326,7 +326,7 @@ JitConstants UniqueGatherKernelRef::GetJitConstants(const unique_gather_params& } if (input.is_dynamic()) { - DimensionAccessHelper dims(input); + DimensionAccessHelperJit dims(input); const std::string total_data_size = toVectorMulString({dims.x(), dims.y(), dims.z(), dims.w(), dims.f(), dims.b()}); jit_constants.AddConstant(MakeJitConstant("TOTAL_DATA_SIZE", total_data_size)); diff --git a/src/plugins/intel_gpu/src/plugin/ops/roi_align_rotated.cpp b/src/plugins/intel_gpu/src/plugin/ops/roi_align_rotated.cpp index 306bab54721ddd..fd50b4dcc76425 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/roi_align_rotated.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/roi_align_rotated.cpp @@ -14,7 +14,7 @@ namespace intel_gpu { namespace { -void CreateROIAlignRotatedOp(ProgramBuilder& p, const std::shared_ptr& op) { +void CreateROIAlignRotatedOp(ProgramBuilder& p, const std::shared_ptr& op) { validate_inputs_count(op, {3}); auto roi_align_prim = cldnn::roi_align(layer_type_name_ID(op), p.GetInputInfo(op), @@ -31,7 +31,7 @@ void CreateROIAlignRotatedOp(ProgramBuilder& p, const std::shared_ptr& op) { + validate_inputs_count(op, {3, 4, 5}); + auto inputs = p.GetInputInfo(op); + auto layerName = layer_type_name_ID(op); + + bool is_causal = op->get_causal(); + auto sdpa_prim = cldnn::scaled_dot_product_attention(layerName, + inputs, + is_causal); + + p.add_primitive(*op, sdpa_prim); +} + +static void CreateSDPAOp(ProgramBuilder& p, const std::shared_ptr& op) { + validate_inputs_count(op, {3, 4, 5}); + auto inputs = p.GetInputInfo(op); + auto layerName = layer_type_name_ID(op); + + bool is_causal = op->get_causal(); + auto sdpa_prim = cldnn::scaled_dot_product_attention(layerName, + inputs, + is_causal, + op->get_input0_transpose_order(), + op->get_input1_transpose_order(), + 
op->get_input2_transpose_order(), + op->get_output_transpose_order()); + + p.add_primitive(*op, sdpa_prim); +} + +REGISTER_FACTORY_IMPL(internal, SDPA); +REGISTER_FACTORY_IMPL(v13, ScaledDotProductAttention); + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp new file mode 100644 index 00000000000000..67e927abb43f97 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp @@ -0,0 +1,171 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "intel_gpu/op/sdpa.hpp" +#include "intel_gpu/plugin/common_utils.hpp" +#include "scaled_dot_product_attention_shape_inference.hpp" +#include "openvino/core/partial_shape.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/broadcast.hpp" +#include "openvino/op/reshape.hpp" + +namespace ov { +namespace intel_gpu { +namespace op { + +SDPA::SDPA(const ov::Output& Q, + const ov::Output& K, + const ov::Output& V, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const bool is_causal, + const ov::element::Type output_type) + : m_order_q(order_q) + , m_order_k(order_k) + , m_order_v(order_v) + , m_order_out(order_out) + , m_is_causal(is_causal) + , m_output_type(output_type) { + set_arguments({Q, K, V}); + validate_and_infer_types(); +} + +SDPA::SDPA(const ov::Output& Q, + const ov::Output& K, + const ov::Output& V, + const ov::Output& attn_mask, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const bool is_causal, + const ov::element::Type output_type) + : m_order_q(order_q) + , m_order_k(order_k) + , m_order_v(order_v) + , m_order_out(order_out) + , m_is_causal(is_causal) + , m_output_type(output_type) { + set_arguments({Q, K, V, attn_mask}); + validate_and_infer_types(); +} + +SDPA::SDPA(const ov::Output& Q, + const ov::Output& K, + const ov::Output& V, + const ov::Output& attn_mask, + const ov::Output& scale, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const bool is_causal, + const ov::element::Type output_type) + : m_order_q(order_q) + , m_order_k(order_k) + , m_order_v(order_v) + , m_order_out(order_out) + , m_is_causal(is_causal) + , m_output_type(output_type) { + set_arguments({Q, K, V, attn_mask, scale}); + validate_and_infer_types(); +} + +std::shared_ptr SDPA::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + return std::make_shared(new_args.at(0), new_args.at(1), new_args.at(2), m_order_q, m_order_k, m_order_v, m_order_out, m_is_causal, m_output_type); +} + +void SDPA::validate_and_infer_types() { + const auto input_size = get_input_size(); + NODE_VALIDATION_CHECK(this, + input_size == 3 || input_size == 4 || input_size == 5, + "Number of inputs is incorrect. Current value is: ", + input_size, + ", expected 3, 4 or 5."); + + std::vector input_shapes; + for (size_t i = 0; i < input_size; i++) { + input_shapes.push_back(get_input_partial_shape(i)); + } + + auto out_shapes = shape_infer(this, + input_shapes, + m_order_q, + m_order_k, + m_order_v, + m_order_out); + + auto output_type = m_output_type == ov::element::undefined ? 
get_input_element_type(0) : m_output_type; + set_output_type(0, output_type, out_shapes[0]); +} + +bool SDPA::visit_attributes(ov::AttributeVisitor &visitor) { + visitor.on_attribute("order_q", m_order_q); + visitor.on_attribute("order_k", m_order_k); + visitor.on_attribute("order_v", m_order_v); + visitor.on_attribute("order_out", m_order_out); + visitor.on_attribute("output_type", m_output_type); + return true; +} + +std::vector shape_infer(const SDPA* op, + std::vector input_shapes, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out) { + auto shape_q = input_shapes[0]; + auto shape_k = input_shapes[1]; + auto shape_v = input_shapes[2]; + + // transposed shape + auto transpose_pshape = [](const ov::PartialShape pshape, const std::vector& order) { + auto transposed_pshape = ov::PartialShape::dynamic(pshape.rank()); + for (size_t i = 0; i < order.size(); i++) { + transposed_pshape[i] = pshape[order[i]]; + } + + return transposed_pshape; + }; + + auto shape_q_t = (order_q.size() > 1) ? transpose_pshape(shape_q, order_q) : shape_q; + auto shape_k_t = (order_k.size() > 1) ? transpose_pshape(shape_k, order_k) : shape_k; + auto shape_v_t = (order_v.size() > 1) ? transpose_pshape(shape_v, order_v) : shape_v; + + const auto is_broadcastable = shape_k_t.rank().is_static() && + shape_v_t.rank().is_static() && + ((shape_q_t.size() == shape_k_t.size()) && (shape_q_t.size() == shape_v_t.size())); + if (is_broadcastable) { + size_t max_rank = shape_q_t.size(); + for (size_t i = 0; i < max_rank; ++i) { + if (shape_q_t[i].is_static() && shape_k_t[i].is_static() && shape_v_t[i].is_static()) { + auto broadcasted_dim = shape_q_t[i].get_length(); + shape_k_t[i] = broadcasted_dim; + shape_v_t[i] = broadcasted_dim; + } + } + } + + std::vector transposed_input_shapes{ shape_q_t, shape_k_t, shape_v_t }; + for (size_t i = 3; i < transposed_input_shapes.size(); i++) { + transposed_input_shapes.push_back(input_shapes[i]); + } + + OPENVINO_ASSERT(op != nullptr, "op should not be nullptr for shape_infer."); + auto out_shapes = ov::op::v13::shape_infer(dynamic_cast(op), transposed_input_shapes); + + if (order_out.size() > 0) { + return { transpose_pshape(out_shapes[0], order_out) }; + } else { + return { out_shapes[0] }; + } +} + +} // namespace op +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/transpose_matmul_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.cpp similarity index 56% rename from src/plugins/intel_gpu/src/plugin/transformations/transpose_matmul_fusion.cpp rename to src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.cpp index e57a7978a5e7bf..614a42845ec521 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/transpose_matmul_fusion.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.cpp @@ -3,14 +3,16 @@ // #include "intel_gpu/op/gemm.hpp" +#include "intel_gpu/op/sdpa.hpp" #include "openvino/core/node_vector.hpp" #include "openvino/core/partial_shape.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/constant.hpp" #include "openvino/pass/pattern/op/label.hpp" #include "openvino/pass/pattern/op/pattern.hpp" -#include "transpose_matmul_fusion.hpp" +#include "transpose_fusion.hpp" #include "openvino/op/matmul.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" #include "openvino/op/convert.hpp" #include "openvino/op/transpose.hpp" #include "openvino/core/rt_info.hpp" @@ 
-25,23 +27,133 @@ using ov::pass::pattern::op::Or; namespace ov { namespace intel_gpu { -class TransposeMatMulMatcher : public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("TransposeMatMulMatcher", "0"); - TransposeMatMulMatcher(); -}; - -class TransposeMatMulTransposeMatcher : public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("TransposeMatMulTransposeMatcher", "0"); - TransposeMatMulTransposeMatcher(); -}; - -TransposeMatMulFusion::TransposeMatMulFusion() { +TransposeFusion::TransposeFusion() { add_matcher(); add_matcher(); + add_matcher(); } +TransposeSDPAMatcher::TransposeSDPAMatcher() { + auto is_fp_type = [](const ov::Output& output) -> bool { + switch (output.get_element_type()) { + case ov::element::f16: + case ov::element::f32: return true; + default: return false; + } + }; + auto not_transpose = [is_fp_type](const ov::Output& output) -> bool { + return std::dynamic_pointer_cast(output.get_node_shared_ptr()) == nullptr + && is_fp_type(output); + }; + auto is_dynamic = [](const ov::Output& output) -> bool { + bool is_dynamic = output.get_node_shared_ptr()->get_output_partial_shape(0).is_dynamic(); + size_t num_inputs = output.get_node_shared_ptr()->get_input_size(); + for (size_t idx = 0; idx < num_inputs; idx++) { + is_dynamic |= output.get_node_shared_ptr()->get_input_partial_shape(idx).is_dynamic(); + } + return is_dynamic; + }; + + auto input_q_m = any_input(not_transpose); + auto input_k_m = any_input(not_transpose); + auto input_v_m = any_input(not_transpose); + auto input_attn_mask = any_input(not_transpose); + auto input_scale = any_input(not_transpose); + auto transpose_q_order_m = wrap_type(consumers_count(1)); + auto transpose_k_order_m = wrap_type(consumers_count(1)); + auto transpose_v_order_m = wrap_type(consumers_count(1)); + auto transpose_q_m = wrap_type({input_q_m, transpose_q_order_m}, is_fp_type); + auto transpose_k_m = wrap_type({input_k_m, transpose_k_order_m}, is_fp_type); + auto transpose_v_m = wrap_type({input_v_m, transpose_v_order_m}, is_fp_type); + + auto sdpa_in_q = std::make_shared(OutputVector{input_q_m, transpose_q_m}); + auto sdpa_in_k = std::make_shared(OutputVector{input_k_m, transpose_k_m}); + auto sdpa_in_v = std::make_shared(OutputVector{input_v_m, transpose_v_m}); + + auto sdpa_without_attn_mask_m = wrap_type({ sdpa_in_q, sdpa_in_k, sdpa_in_v }, is_dynamic); + auto sdpa_with_attn_mask_m = wrap_type({ sdpa_in_q, sdpa_in_k, sdpa_in_v, input_attn_mask }, is_dynamic); + auto sdpa_with_attn_mask_and_scale_m = + wrap_type({ sdpa_in_q, sdpa_in_k, sdpa_in_v, input_attn_mask, input_scale }, is_dynamic); + + auto sdpa_m = std::make_shared(OutputVector{sdpa_without_attn_mask_m, sdpa_with_attn_mask_m, sdpa_with_attn_mask_and_scale_m}); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + std::shared_ptr sdpa; + if (pattern_map.find(sdpa_without_attn_mask_m) != pattern_map.end()) { + sdpa = std::dynamic_pointer_cast(pattern_map.at(sdpa_without_attn_mask_m).get_node_shared_ptr()); + } else if (pattern_map.find(sdpa_with_attn_mask_m) != pattern_map.end()) { + sdpa = std::dynamic_pointer_cast(pattern_map.at(sdpa_with_attn_mask_m).get_node_shared_ptr()); + } else if (pattern_map.find(sdpa_with_attn_mask_and_scale_m) != pattern_map.end()) { + sdpa = std::dynamic_pointer_cast(pattern_map.at(sdpa_with_attn_mask_and_scale_m).get_node_shared_ptr()); + } + + if (!sdpa || transformation_callback(sdpa)) { + return false; + } + + auto order_q = 
op::SDPA::default_order(sdpa->get_input_partial_shape(0).size()); + auto order_k = op::SDPA::default_order(sdpa->get_input_partial_shape(1).size()); + auto order_v = op::SDPA::default_order(sdpa->get_input_partial_shape(2).size()); + auto order_output = op::SDPA::default_order(sdpa->get_output_partial_shape(0).size()); + size_t input_q_output_idx = sdpa->get_input_source_output(0).get_index(); + size_t input_k_output_idx = sdpa->get_input_source_output(1).get_index(); + size_t input_v_output_idx = sdpa->get_input_source_output(2).get_index(); + + if (pattern_map.count(transpose_q_m) > 0) { + auto tranpose_a_order = std::dynamic_pointer_cast(pattern_map.at(transpose_q_order_m).get_node_shared_ptr()); + order_q = tranpose_a_order->cast_vector(); + if (order_q.back() != static_cast(order_q.size() - 1)) // Allow any transposes without head_size dim position change + return false; + + auto tranpose_a = std::dynamic_pointer_cast(pattern_map.at(transpose_q_m).get_node_shared_ptr()); + input_q_output_idx = tranpose_a->get_input_source_output(0).get_index(); + } + if (pattern_map.count(transpose_k_m) > 0) { + auto tranpose_b_order = std::dynamic_pointer_cast(pattern_map.at(transpose_k_order_m).get_node_shared_ptr()); + order_k = tranpose_b_order->cast_vector(); + if (order_k.back() != static_cast(order_k.size() - 1)) // Allow any transposes without head_size dim position change + return false; + + auto tranpose_b = std::dynamic_pointer_cast(pattern_map.at(transpose_k_m).get_node_shared_ptr()); + input_k_output_idx = tranpose_b->get_input_source_output(0).get_index(); + } + if (pattern_map.count(transpose_v_m) > 0) { + auto tranpose_c_order = std::dynamic_pointer_cast(pattern_map.at(transpose_v_order_m).get_node_shared_ptr()); + order_v = tranpose_c_order->cast_vector(); + if (order_v.back() != static_cast(order_v.size() - 1)) // Allow any transposes without head_size dim position change + return false; + + auto tranpose_c = std::dynamic_pointer_cast(pattern_map.at(transpose_k_m).get_node_shared_ptr()); + input_v_output_idx = tranpose_c->get_input_source_output(0).get_index(); + } + + auto input_q = ov::Output(pattern_map.at(input_q_m).get_node_shared_ptr(), input_q_output_idx); + auto input_k = ov::Output(pattern_map.at(input_k_m).get_node_shared_ptr(), input_k_output_idx); + auto input_v = ov::Output(pattern_map.at(input_v_m).get_node_shared_ptr(), input_v_output_idx); + + std::shared_ptr sdpa_new; + if (pattern_map.find(sdpa_without_attn_mask_m) != pattern_map.end()) { + sdpa_new = std::make_shared(input_q, input_k, input_v, order_q, order_k, order_v, order_output, sdpa->get_causal()); + } else if (pattern_map.find(sdpa_with_attn_mask_m) != pattern_map.end()) { + auto attn_mask = sdpa->get_input_source_output(3); + sdpa_new = std::make_shared(input_q, input_k, input_v, attn_mask, order_q, order_k, order_v, order_output, sdpa->get_causal()); + } else if (pattern_map.find(sdpa_with_attn_mask_and_scale_m) != pattern_map.end()) { + auto attn_mask = sdpa->get_input_source_output(3); + auto scale = sdpa->get_input_source_output(4); + sdpa_new = std::make_shared(input_q, input_k, input_v, attn_mask, scale, order_q, order_k, order_v, order_output, sdpa->get_causal()); + } + + sdpa_new->set_friendly_name(sdpa->get_friendly_name()); + ov::copy_runtime_info(m.get_matched_nodes(), sdpa_new); + ov::replace_node(sdpa, sdpa_new); + return true; + }; + + auto m = std::make_shared(sdpa_m, "TransposeSDPAMatcher"); + this->register_matcher(m, callback); +} TransposeMatMulMatcher::TransposeMatMulMatcher() { auto 
is_fp_type = [](const ov::Output& output) -> bool { diff --git a/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.hpp b/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.hpp new file mode 100644 index 00000000000000..a9b3ebe05317f3 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.hpp @@ -0,0 +1,37 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + +class TransposeFusion: public ov::pass::GraphRewrite { +public: + OPENVINO_RTTI("TransposeFusion", "0"); + TransposeFusion(); +}; + +class TransposeMatMulMatcher : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("TransposeMatMulMatcher", "0"); + TransposeMatMulMatcher(); +}; + +class TransposeMatMulTransposeMatcher : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("TransposeMatMulTransposeMatcher", "0"); + TransposeMatMulTransposeMatcher(); +}; + +class TransposeSDPAMatcher : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("TransposeSDPAMatcher", "0"); + TransposeSDPAMatcher(); +}; + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/transpose_matmul_fusion.hpp b/src/plugins/intel_gpu/src/plugin/transformations/transpose_matmul_fusion.hpp deleted file mode 100644 index b24d76059ada11..00000000000000 --- a/src/plugins/intel_gpu/src/plugin/transformations/transpose_matmul_fusion.hpp +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "openvino/pass/graph_rewrite.hpp" - -namespace ov { -namespace intel_gpu { - -class TransposeMatMulFusion: public ov::pass::GraphRewrite { -public: - OPENVINO_RTTI("TransposeMatMulFusion", "0"); - TransposeMatMulFusion(); -}; - -} // namespace intel_gpu -} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.cpp new file mode 100644 index 00000000000000..3fdb3794585106 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.cpp @@ -0,0 +1,134 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "unsqueeze_broadcast_reshape_sdpa_fusion.hpp" + +#include "intel_gpu/op/sdpa.hpp" +#include "intel_gpu/op/kv_cache.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/op/broadcast.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/unsqueeze.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "transformations/utils/utils.hpp" + +namespace ov { +namespace intel_gpu { +using ov::pass::pattern::op::Or; + +UnsqueezeBroadcastReshapeSDPAFusion::UnsqueezeBroadcastReshapeSDPAFusion() { + using namespace ov::pass::pattern; + + auto not_reshape = [](const ov::Output& output) -> bool { + return std::dynamic_pointer_cast(output.get_node_shared_ptr()) == nullptr; + }; + + auto unsqueeze_predicate = [](const ov::Output& output) -> bool { + return rank_equals(5)(output) && consumers_count(1); + }; + + auto broadcast_predicate = [](const ov::Output& output) -> bool { + const auto broadcast = ov::as_type_ptr(output.get_node_shared_ptr()); + if (!broadcast || 
broadcast->get_broadcast_spec().m_type != ov::op::BroadcastType::BIDIRECTIONAL) + return false; + return rank_equals(5)(output) && consumers_count(1); + }; + + auto reshape_predicate = [](const ov::Output& output) -> bool { + return rank_equals(4)(output) && consumers_count(1); + }; + + auto input_a_m = any_input(not_reshape); + auto input_attn_mask = any_input(); + auto input_scale = any_input(); + auto input_b_m = wrap_type({any_input(), any_input()}); + auto input_c_m = wrap_type({any_input(), any_input()}); + auto axes_const_b_m = wrap_type(); + auto axes_const_c_m = wrap_type(); + auto unsqueeze_b_m = wrap_type({input_b_m, axes_const_b_m}, unsqueeze_predicate); + auto unsqueeze_c_m = wrap_type({input_c_m, axes_const_c_m}, unsqueeze_predicate); + auto broadcast_b_m = wrap_type({unsqueeze_b_m, any_input()}, broadcast_predicate); + auto broadcast_c_m = wrap_type({unsqueeze_c_m, any_input()}, broadcast_predicate); + auto reshape_b_m = wrap_type({broadcast_b_m, any_input()}, reshape_predicate); + auto reshape_c_m = wrap_type({broadcast_c_m, any_input()}, reshape_predicate); + + auto sdpa_without_attn_mask_m = wrap_type({ input_a_m, reshape_b_m, reshape_c_m }); + auto sdpa_with_attn_mask_m = wrap_type({ input_a_m, reshape_b_m, reshape_c_m, input_attn_mask }); + auto sdpa_with_attn_mask_and_scale_m = wrap_type({ input_a_m, reshape_b_m, reshape_c_m, input_attn_mask, input_scale }); + + auto sdpa_m = std::make_shared(OutputVector{sdpa_without_attn_mask_m, sdpa_with_attn_mask_m, sdpa_with_attn_mask_and_scale_m}); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { + if (transformation_callback(m.get_match_root())) { + return false; + } + const auto& pattern_map = m.get_pattern_value_map(); + + auto valid_broadcast_target_shape = [](const std::vector& target_shape) { + return std::count_if(target_shape.begin(), target_shape.end(), [](int32_t s) { return s != 1; }) == 1; + }; + auto broadcast_b = std::dynamic_pointer_cast(pattern_map.at(broadcast_b_m).get_node_shared_ptr()); + auto broadcast_c = std::dynamic_pointer_cast(pattern_map.at(broadcast_c_m).get_node_shared_ptr()); + + std::vector target_shape_val_b; + auto target_shape_constant_b = std::dynamic_pointer_cast(broadcast_c->get_input_node_shared_ptr(1)); + if (target_shape_constant_b) { + target_shape_val_b = target_shape_constant_b->cast_vector(); + if (!valid_broadcast_target_shape(target_shape_val_b)) { + return false; + } + } + + std::vector target_shape_val_c; + auto target_shape_constant_c = std::dynamic_pointer_cast(broadcast_b->get_input_node_shared_ptr(1)); + if (target_shape_constant_c) { + target_shape_val_c = target_shape_constant_c->cast_vector(); + if (!valid_broadcast_target_shape(target_shape_val_c)) { + return false; + } + } + + // Expect the same broadcast rules for key and value inputs + if (target_shape_val_b != target_shape_val_c) { + return false; + } + + auto input_a = pattern_map.at(input_a_m).get_node_shared_ptr(); + auto input_b = pattern_map.at(input_b_m).get_node_shared_ptr(); + auto input_c = pattern_map.at(input_c_m).get_node_shared_ptr(); + + auto sdpa = std::dynamic_pointer_cast(m.get_match_root()); + auto order_a = sdpa->get_input0_transpose_order(); + auto order_b = sdpa->get_input1_transpose_order(); + auto order_c = sdpa->get_input2_transpose_order(); + auto order_d = sdpa->get_output_transpose_order(); + + std::shared_ptr sdpa_new; + if (pattern_map.find(sdpa_without_attn_mask_m) != pattern_map.end()) { + sdpa_new = std::make_shared(input_a, input_b, 
input_c, order_a, order_b, order_c, order_d, sdpa->get_causal()); + } else if (pattern_map.find(sdpa_with_attn_mask_m) != pattern_map.end()) { + auto attn_mask = sdpa->get_input_source_output(3); + sdpa_new = std::make_shared(input_a, input_b, input_c, attn_mask, order_a, order_b, order_c, order_d, sdpa->get_causal()); + } else if (pattern_map.find(sdpa_with_attn_mask_and_scale_m) != pattern_map.end()) { + auto attn_mask = sdpa->get_input_source_output(3); + auto scale = sdpa->get_input_source_output(4); + sdpa_new = std::make_shared(input_a, input_b, input_c, attn_mask, scale, order_a, order_b, order_c, order_d, sdpa->get_causal()); + } + + sdpa_new->set_friendly_name(sdpa->get_friendly_name()); + ov::copy_runtime_info(m.get_matched_nodes(), sdpa_new); + ov::replace_node(sdpa, sdpa_new); + + return true; + }; + + auto m = std::make_shared(sdpa_m, "UnsqueezeBroadcastReshapeSDPAFusion"); + this->register_matcher(m, callback); +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.hpp b/src/plugins/intel_gpu/src/plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.hpp new file mode 100644 index 00000000000000..ede3ac16fb51b5 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.hpp @@ -0,0 +1,19 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + +class UnsqueezeBroadcastReshapeSDPAFusion : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("UnsqueezeBroadcastReshapeSDPAFusion", "0"); + UnsqueezeBroadcastReshapeSDPAFusion(); +}; + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 0c690dfe7d6df1..5d8db18151cd4e 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -61,11 +61,12 @@ #include "plugin/transformations/bcast_and_pad_zp_buffers.hpp" #include "transformations/common_optimizations/rms_fusion.hpp" #include "plugin/transformations/swiglu_fusion.hpp" -#include "plugin/transformations/transpose_matmul_fusion.hpp" +#include "plugin/transformations/transpose_fusion.hpp" #include "plugin/transformations/indirect_kv_cache.hpp" #include "plugin/transformations/convert_convolution.hpp" #include "plugin/transformations/unsqueeze_broadcast_reshape_matmul_fusion.hpp" #include "transformations/common_optimizations/rms_fusion.hpp" +#include "plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.hpp" #include "transformations/common_optimizations/broadcast_elementwise_fusion.hpp" #include "transformations/common_optimizations/broadcast_transition.hpp" #include "transformations/common_optimizations/common_optimizations.hpp" @@ -134,6 +135,7 @@ #include "transformations/op_conversions/simplify_ctc_greedy_decoder_seq_len.hpp" #include "transformations/op_conversions/softmax_decomposition.hpp" #include "transformations/op_conversions/softplus_decomposition.hpp" +#include "transformations/op_conversions/scaled_dot_product_attention_decomposition.hpp" #include "transformations/opset_conversions/convert_opset2_to_opset1.hpp" #include "transformations/opset_conversions/convert_opset3_to_opset2.hpp" #include "transformations/resolve_names_collisions.hpp" @@ -303,6 
+305,50 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); + pass_config->set_callback([&](const std::shared_ptr node){ + if (!config.get_property(ov::intel_gpu::hint::enable_sdpa_optimization)) + return false; + + auto sdpa = std::dynamic_pointer_cast(node); + const auto& query_ps = sdpa->get_input_partial_shape(0); + const auto& key_ps = sdpa->get_input_partial_shape(1); + const auto& value_ps = sdpa->get_input_partial_shape(2); + + // Known limitations: + // - SDPA impl could be slower in non-LLM scenarios than decomposed version + if (func->get_variables().size() == 0) + return false; + + // - The data type of SDPA should be fp16 + if (sdpa->get_output_element_type(0) != ov::element::f16) + return false; + + // - The number of dimensions for each input is expected to be 4 + if (query_ps.size() != 4 || key_ps.size() != 4 || value_ps.size() != 4) { + return false; + } + + // - The head size of all Q, K, and V inputs should be the same static value + if (query_ps[query_ps.size() - 1].is_dynamic() || key_ps[key_ps.size() - 1].is_dynamic() || value_ps[query_ps.size() - 1].is_dynamic()) { + return false; + } + + if (query_ps[query_ps.size() - 1].get_length() != key_ps[key_ps.size() - 1].get_length() || + query_ps[query_ps.size() - 1].get_length() != value_ps[query_ps.size() - 1].get_length()) { + return false; + } + + // - The head size should be divisible by 16 + const auto optimal_subgroup_size = 16; + if (query_ps[query_ps.size() - 1].is_dynamic() || + query_ps[query_ps.size() - 1].get_length() > 256 || + query_ps[query_ps.size() - 1].get_length() % optimal_subgroup_size != 0) { + return false; + } + + return true; + }); + manager.register_pass(); manager.register_pass(); @@ -749,10 +795,17 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); manager.register_pass(); manager.register_pass(); + manager.register_pass(); + if (device_info.supports_immad) { + manager.get_pass_config()->disable(); + manager.get_pass_config()->disable(); + } + if (!device_info.supports_immad) { - manager.register_pass(); manager.register_pass(); } + manager.register_pass(); + manager.register_pass(); manager.register_pass(); manager.register_pass(); diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index 8a57759bff9413..66b8d3e70cab1f 100644 --- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -50,6 +50,7 @@ void ExecutionConfig::set_default() { std::make_tuple(ov::intel_gpu::hint::host_task_priority, ov::hint::Priority::MEDIUM), std::make_tuple(ov::intel_gpu::hint::queue_throttle, ov::intel_gpu::hint::ThrottleLevel::MEDIUM), std::make_tuple(ov::intel_gpu::hint::queue_priority, ov::hint::Priority::MEDIUM), + std::make_tuple(ov::intel_gpu::hint::enable_sdpa_optimization, false), std::make_tuple(ov::intel_gpu::enable_loop_unrolling, true), std::make_tuple(ov::intel_gpu::disable_winograd_convolution, false), std::make_tuple(ov::internal::exclusive_async_requests, false), diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/scaled_dot_product_attention.cpp new file mode 100644 index 00000000000000..3b97cde5cfe636 --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/scaled_dot_product_attention.cpp @@ -0,0 +1,248 @@ +// Copyright (C) 2022 Intel 
Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/ov_tensor_utils.hpp" +#include "common_test_utils/test_enums.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" + + +#include "openvino/opsets/opset13.hpp" +#include "transformations/op_conversions/scaled_dot_product_attention_decomposition.hpp" +#include "openvino/pass/manager.hpp" + +#include "openvino/op/parameter.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/result.hpp" +#include "openvino/op/matmul.hpp" + +#include "intel_gpu/runtime/execution_config.hpp" + +namespace { +using ov::test::InputShape; + +typedef std::tuple, // shape + bool, // is_causal + bool, // has_attn + bool, // has_scale + std::string // targetDevice + > ScaledAttnGPUTestParams; + +class ScaledAttnLayerGPUTest : public testing::WithParamInterface, + virtual public ov::test::SubgraphBaseTest { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj); + +protected: + void SetUp() override; + void generate_inputs(const std::vector& targetInputStaticShapes) override; + bool is_causal; + bool has_attn; + bool has_scale; +}; + +std::string ScaledAttnLayerGPUTest::getTestCaseName(const testing::TestParamInfo& obj) { + ov::element::Type inType; + std::vector inputShapes; + bool is_causal; + bool has_attn; + bool has_scale; + std::string targetDevice; + std::tie(inType, inputShapes, is_causal, has_attn, has_scale, targetDevice) = obj.param; + + std::ostringstream result; + result << "netPRC=" << inType << "_"; + result << "IS="; + for (const auto& inputShape : inputShapes) { + result << ov::test::utils::partialShape2str({inputShape.first}) << "_"; + } + result << "TS="; + for (const auto& shapes : inputShapes) { + for (const auto& shape : shapes.second) { + result << ov::test::utils::vec2str(shape); + result << "_"; + } + } + result << "is_causal=" << is_causal << "_"; + result << "has_attn=" << has_attn << "_"; + result << "has_scale=" << has_scale << "_"; + result << "trgDev=" << targetDevice; + + return result.str(); +} + +void ScaledAttnLayerGPUTest::SetUp() { + ov::element::Type inType; + std::vector inputShapes; + std::tie(inType, inputShapes, is_causal, has_attn, has_scale, targetDevice) = this->GetParam(); + + init_input_shapes(inputShapes); + ov::ParameterVector inputParams; + // q, k, v + inputParams.push_back(std::make_shared(inType, inputDynamicShapes[0])); + inputParams.push_back(std::make_shared(inType, inputDynamicShapes[1])); + inputParams.push_back(std::make_shared(inType, inputDynamicShapes[1])); + inputParams[0]->set_friendly_name("q"); + inputParams[1]->set_friendly_name("k"); + inputParams[2]->set_friendly_name("v"); + // special case: only scale but no attn + if (!has_attn && has_scale) { + // attention_mask:[1] + inputParams.push_back(std::make_shared(inType, ov::PartialShape{})); + inputParams.back()->set_friendly_name("attention_mask"); + // scale:[1] + inputParams.push_back(std::make_shared(inType, ov::PartialShape{1})); + inputParams.back()->set_friendly_name("scale"); + } else { + if (has_attn) { + inputParams.push_back(std::make_shared(inType, inputDynamicShapes[2])); + inputParams.back()->set_friendly_name("attention_mask"); + } + if (has_scale) { + // scale:[1] + inputParams.push_back(std::make_shared(inType, ov::PartialShape{1})); + inputParams.back()->set_friendly_name("scale"); + } + } + + // Add artificial read/value operations to the model to trigger the enabling of the SDPA operation + auto read_key = std::make_shared(inputParams.at(1), "v0"); + auto 
assign_key = std::make_shared(read_key, "v0"); + + auto read_value = std::make_shared(inputParams.at(2), "v0"); + auto assign_value = std::make_shared(read_value, "v0"); + + ov::OutputVector inputs; + for (size_t i = 0; i < inputParams.size(); i++) { + if (i == 1) + inputs.push_back(read_key); + else if (i == 2) + inputs.push_back(read_value); + else + inputs.push_back(inputParams[i]); + } + + auto sdp = std::make_shared(inputs, is_causal); + sdp->set_friendly_name("sdpa"); + + auto output = std::make_shared(sdp->output(0)); + + function = std::make_shared(ov::OutputVector{output}, ov::SinkVector{assign_key, assign_value}, inputParams, "sdpa_model"); + + functionRefs = function->clone(); + ov::pass::Manager manager; + + // Decompose ScaledDotProductAttention + manager.register_pass(); + manager.run_passes(functionRefs); + + // Enable SDPA + configuration.insert(ov::intel_gpu::hint::enable_sdpa_optimization(true)); + + auto it = std::find_if(inputShapes[1].second.begin(), inputShapes[1].second.end(), [&](const ov::Shape& shape){ + return shape[2] >= 384; + }); + + bool has_long_seq = it != inputShapes[1].second.end(); + if (inType == ov::element::f16) { + if (has_long_seq) { + abs_threshold = 0.025; + rel_threshold = 0.025; + } else { + abs_threshold = 0.005; + rel_threshold = 0.005; + } + } +} + +void ScaledAttnLayerGPUTest::generate_inputs(const std::vector& targetInputStaticShapes) { + std::vector shapes(3); + shapes[0] = targetInputStaticShapes[0]; + shapes[1] = targetInputStaticShapes[1]; + shapes[2] = targetInputStaticShapes[1]; + if (!has_attn && has_scale) { + shapes.push_back(ov::Shape{}); + shapes.push_back(ov::Shape{1}); + } else { + if (has_attn) { + shapes.push_back(targetInputStaticShapes[2]); + } + if (has_scale) { + shapes.push_back(ov::Shape{1}); + } + } + SubgraphBaseTest::generate_inputs(shapes); +} + +TEST_P(ScaledAttnLayerGPUTest, CompareWithRefs) { + ov::element::Type inType; + std::vector inputShapes; + bool is_causal; + bool has_attn; + bool has_scale; + std::string targetDevice; + std::tie(inType, inputShapes, is_causal, has_attn, has_scale, targetDevice) = this->GetParam(); + run(); +} + +const std::vector> shapes{ + // normal case, shapes of q,k,v are same + { + // q shape + {ov::test::InputShape{ov::PartialShape{-1, 8, -1, 64}, + {ov::Shape{1, 8, 100, 64}, ov::Shape{1, 8, 1, 64}, ov::Shape{2, 8, 10, 64}}} + }, + // kv shape + {ov::test::InputShape{ov::PartialShape{-1, 8, -1, 64}, + {ov::Shape{1, 8, 100, 64}, ov::Shape{1, 8, 1, 64}, ov::Shape{2, 8, 10, 64}}} + }, + // attn shape: [B, 1, -1, L0+L1] + {ov::test::InputShape{ov::PartialShape{-1, 1, -1, -1}, + {ov::Shape{1, 1, 100, 100}, ov::Shape{1, 1, 1, 1}, ov::Shape{2, 1, 10, 10}}} + }, + }, + { + // q shape + {ov::test::InputShape{ov::PartialShape{-1, 5, -1, 64}, + {ov::Shape{2, 5, 100, 64}, ov::Shape{2, 5, 1, 64}, ov::Shape{2, 5, 384, 64}}} + }, + // kv shape + {ov::test::InputShape{ov::PartialShape{-1, 5, -1, 64}, + {ov::Shape{2, 5, 100, 64}, ov::Shape{2, 5, 1, 64}, ov::Shape{2, 5, 384, 64}}} + }, + // attn shape: [B, 1, -1, L0+L1] + {ov::test::InputShape{ov::PartialShape{-1, 1, -1, -1}, + {ov::Shape{1, 1, 100, 100}, ov::Shape{1, 1, 1, 1}, ov::Shape{2, 1, 384, 384}}} + }, + }, + // heads number of kv is 1, attn mask: [B, H, L1, L0+L1] + { + // q shape + {ov::test::InputShape{ov::PartialShape{-1, 8, -1, 64}, + {ov::Shape{1, 8, 100, 64}, ov::Shape{1, 8, 1, 64}, ov::Shape{2, 8, 10, 64}}} + }, + // kv shape + {ov::test::InputShape{ov::PartialShape{-1, 1, -1, 64}, + {ov::Shape{1, 1, 100, 64}, ov::Shape{1, 1, 1, 64}, 
ov::Shape{2, 1, 10, 64}}} + }, + // attn shape + {ov::test::InputShape{ov::PartialShape{-1, 8, -1, -1}, + {ov::Shape{1, 8, 100, 100}, ov::Shape{1, 8, 1, 1}, ov::Shape{2, 8, 10, 10}}} + }, + }, +}; + +const auto params = testing::Combine(testing::Values(ov::element::f16 /*, ov::element::f32 */), + testing::ValuesIn(shapes), + testing::Values(true, false), + testing::Values(true, false), + testing::Values(true, false), + testing::Values(ov::test::utils::DEVICE_GPU)); + +INSTANTIATE_TEST_SUITE_P(smoke_ScaledAttn_GPU, + ScaledAttnLayerGPUTest, + params, + ScaledAttnLayerGPUTest::getTestCaseName); + +} // namespace diff --git a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt index 3e6e89c870f625..8bff8f56c50156 100644 --- a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt +++ b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt @@ -57,7 +57,7 @@ ov_set_threading_interface_for(${TARGET_NAME}) # Workaround to avoid warnings during LTO build if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS_RELEASE "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized") + set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS_RELEASE "-Wno-error=maybe-uninitialized -Wno-maybe-uninitialized -Wno-stringop-overflow") endif() set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp index 8ce9e294a867fe..180d8cdb036483 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp @@ -2527,150 +2527,302 @@ INSTANTIATE_TEST_SUITE_P(gemm_gpu, gemm_onednn_ndims, ::testing::ValuesIn(std::v gemm_onednn_test_params{ CASE_GEMM_ONEDNN_I8_6D }, })); -TEST(gemm_onednn, impl_replacement_with_cldnn) { - auto& engine = get_test_engine(); +class gemm_onednn: public ::testing::Test { +public: + void test_impl_replacement_with_cldnn() { + auto& engine = get_test_engine(); - if (!engine.get_device_info().supports_immad) - return; + if (!engine.get_device_info().supports_immad) + return; + + ov::Shape in1_shape = { 1, 1, 3, 4 }; + ov::Shape in2_shape = { 1, 4 }; + auto in1_layout = layout{ov::PartialShape::dynamic(in1_shape.size()), data_types::f32, format::bfyx}; + auto in2_layout = layout{ov::PartialShape::dynamic(in2_shape.size()), data_types::f32, format::bfyx}; + auto input1 = engine.allocate_memory(layout{ov::PartialShape(in1_shape), data_types::f32, format::bfyx}); + auto input2 = engine.allocate_memory(layout{ov::PartialShape(in2_shape), data_types::f32, format::bfyx}); + + std::vector input1_data = { + 1.f, -2.f, 3.f, -4.f, + 5.f, 6.f, 1.f, 2.f, + 3.f, 3.f, 2.f, -1.f, + }; + + std::vector input2_data = { + 2.f, 5.f, -4.f, -7.f, + }; + set_values(input1, input1_data); + set_values(input2, input2_data); + + std::vector out_data = { + 8.f, 22.f, 20.f + }; + + topology topology; + topology.add(input_layout("input1", in1_layout), + input_layout("input2", in2_layout), + gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f32, false, true, 1.0f, 0.0f, 4, 2) + ); + + ov::intel_gpu::ImplementationDesc fc_impl = { format::bfyx, "", impl_types::onednn }; + ExecutionConfig cfg{ ov::intel_gpu::queue_type(QueueTypes::in_order), + ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", fc_impl} }), + ov::intel_gpu::optimize_data(true), + 
ov::intel_gpu::allow_new_shape_infer(true) }; + + network network(engine, topology, cfg); + network.set_input_data("input1", input1); + network.set_input_data("input2", input2); + + auto inst = network.get_primitive("gemm"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != nullptr); + ASSERT_TRUE(impl->is_dynamic()); + + auto outputs = network.execute(); + + auto output = outputs.at("gemm").get_memory(); + cldnn::mem_lock output_ptr(output, get_test_stream()); + + ASSERT_EQ(output_ptr.size(), (uint32_t)3); + for (uint32_t i = 0; i < out_data.size(); ++i) { + ASSERT_FLOAT_EQ(output_ptr[i], out_data[i]); + } + + // WA: Call wait_all() to wait for all queued kernels compilation finish + network.get_program()->get_compilation_context().wait_all(); - ov::Shape in1_shape = { 1, 1, 3, 4 }; - ov::Shape in2_shape = { 1, 4 }; - auto in1_layout = layout{ov::PartialShape::dynamic(in1_shape.size()), data_types::f32, format::bfyx}; - auto in2_layout = layout{ov::PartialShape::dynamic(in2_shape.size()), data_types::f32, format::bfyx}; - auto input1 = engine.allocate_memory(layout{ov::PartialShape(in1_shape), data_types::f32, format::bfyx}); - auto input2 = engine.allocate_memory(layout{ov::PartialShape(in2_shape), data_types::f32, format::bfyx}); - - std::vector input1_data = { - 1.f, -2.f, 3.f, -4.f, - 5.f, 6.f, 1.f, 2.f, - 3.f, 3.f, 2.f, -1.f, - }; - - std::vector input2_data = { - 2.f, 5.f, -4.f, -7.f, - }; - set_values(input1, input1_data); - set_values(input2, input2_data); - - std::vector out_data = { - 8.f, 22.f, 20.f - }; - - topology topology; - topology.add(input_layout("input1", in1_layout), - input_layout("input2", in2_layout), - gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f32, false, true, 1.0f, 0.0f, 4, 2) - ); - - ov::intel_gpu::ImplementationDesc fc_impl = { format::bfyx, "", impl_types::onednn }; - ExecutionConfig cfg{ ov::intel_gpu::queue_type(QueueTypes::in_order), - ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", fc_impl} }), - ov::intel_gpu::optimize_data(true), - ov::intel_gpu::allow_new_shape_infer(true) }; - - network network(engine, topology, cfg); - network.set_input_data("input1", input1); - network.set_input_data("input2", input2); - - auto inst = network.get_primitive("gemm"); - auto impl = inst->get_impl(); - ASSERT_TRUE(impl != nullptr); - ASSERT_TRUE(impl->is_dynamic()); - - auto outputs = network.execute(); - - auto output = outputs.at("gemm").get_memory(); - cldnn::mem_lock output_ptr(output, get_test_stream()); - - ASSERT_EQ(output_ptr.size(), (uint32_t)3); - for (uint32_t i = 0; i < out_data.size(); ++i) { - ASSERT_FLOAT_EQ(output_ptr[i], out_data[i]); + // Check if OneDNN's impl is used for the next execute() call + network.execute(); + inst = network.get_primitive("gemm"); + impl = inst->get_impl(); + ASSERT_TRUE(impl != nullptr); + ASSERT_FALSE(impl->is_dynamic()); } - // WA: Call wait_all() to wait for all queued kernels compilation finish - network.get_program()->get_compilation_context().wait_all(); + void test_check_transpose_format(const std::vector& permute_order) { + auto& engine = get_test_engine(); + tests::random_generator rg; + rg.set_seed(GET_SUITE_NAME); - // Check if OneDNN's impl is used for the next execute() call - network.execute(); - inst = network.get_primitive("gemm"); - impl = inst->get_impl(); - ASSERT_TRUE(impl != nullptr); - ASSERT_FALSE(impl->is_dynamic()); -} + if (!engine.get_device_info().supports_immad) + return; -// Check gemm_onednn transpose_format() can accept transpose 
white list format (byfx/bxfy) -TEST(gemm_onednn, check_transpose_format_byfx) { - auto& engine = get_test_engine(); - tests::random_generator rg; - rg.set_seed(GET_SUITE_NAME); + auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 128, 64, 12 } }); + auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 128, 64, 12 } }); - if (!engine.get_device_info().supports_immad) - return; + topology topology; + topology.add(input_layout("input0", input0->get_layout())); + topology.add(permute("permute0", input_info("input0"), permute_order)); + topology.add(input_layout("input1", input1->get_layout())); + topology.add(permute("permute1", input_info("input1"), permute_order)); + topology.add(gemm("gemm", { input_info("permute0"), input_info("permute1") }, data_types::f16, false, true)); + + ov::intel_gpu::ImplementationDesc impl = { format::bfyx, "", impl_types::onednn }; + ExecutionConfig config{ ov::intel_gpu::queue_type(QueueTypes::in_order), + ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", impl} }), + ov::intel_gpu::optimize_data(true), + ov::intel_gpu::allow_new_shape_infer(false) }; + network network(engine, topology, config); + + auto input0_data = rg.generate_random_1d(input0->get_layout().count(), -1, 1); + auto input1_data = rg.generate_random_1d(input1->get_layout().count(), -1, 1); + + set_values(input0, input0_data); + set_values(input1, input1_data); - auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 128, 64, 12 } }); - auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 128, 64, 12 } }); + network.set_input_data("input0", input0); + network.set_input_data("input1", input1); - topology topology; - topology.add(input_layout("input0", input0->get_layout())); - topology.add(permute("permute0", input_info("input0"), {0, 2, 1, 3})); - topology.add(input_layout("input1", input1->get_layout())); - topology.add(permute("permute1", input_info("input1"), {0, 2, 1, 3})); - topology.add(gemm("gemm", { input_info("permute0"), input_info("permute1") }, data_types::f16, false, true)); + ASSERT_NO_FATAL_FAILURE(network.execute()); + } - ov::intel_gpu::ImplementationDesc impl = { format::bfyx, "", impl_types::onednn }; - ExecutionConfig config{ ov::intel_gpu::queue_type(QueueTypes::in_order), - ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", impl} }), - ov::intel_gpu::optimize_data(true), - ov::intel_gpu::allow_new_shape_infer(false) }; - network network(engine, topology, config); + void test_dynamic_padding(bool n_dim_only) { + tests::random_generator rg; + rg.set_seed(GET_SUITE_NAME); - auto input0_data = rg.generate_random_1d(input0->get_layout().count(), -1, 1); - auto input1_data = rg.generate_random_1d(input1->get_layout().count(), -1, 1); + auto& engine = get_test_engine(); - set_values(input0, input0_data); - set_values(input1, input1_data); + if (!engine.get_device_info().supports_immad) + return; - network.set_input_data("input0", input0); - network.set_input_data("input1", input1); + const unsigned long BATCH_SIZE = 31; + const unsigned long M_SIZE = 11; + const unsigned long K_SIZE = 37; + const unsigned long N_SIZE = 49; - ASSERT_NO_FATAL_FAILURE(network.execute()); -} + auto fill_mem = [&](cldnn::memory_ptr mem, std::vector& data) { + cldnn::mem_lock mem_ptr(mem, get_test_stream()); + auto&& l = mem->get_layout(); + auto data_idx = 0; + for (cldnn::tensor::value_type b = 0; b < l.batch(); ++b) { + for (cldnn::tensor::value_type f = 0; f < 
l.feature(); ++f) { + for (cldnn::tensor::value_type y = 0; y < l.spatial(1); ++y) { + for (cldnn::tensor::value_type x = 0; x < l.spatial(0); ++x) { + auto tensor_coord = cldnn::tensor{{b, f, x, y}, 0}; + auto buffer_idx = l.get_linear_offset(tensor_coord); + mem_ptr[buffer_idx] = data[data_idx++]; + } + } + } + } + }; -TEST(gemm_onednn, check_transpose_format_bxfy) { - auto& engine = get_test_engine(); - tests::random_generator rg; - rg.set_seed(GET_SUITE_NAME); + const auto align_size_m = 13; + const auto align_size_k = 16; + const auto align_size_n = 15; + const auto align_size_b1 = 3; + const auto align_size_b2 = 19; - if (!engine.get_device_info().supports_immad) - return; + const auto aligned_batch1_size = align_to(1ul, align_size_b1); + auto padding_size_batch1 = static_cast(aligned_batch1_size - 1); - auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 128, 64, 12 } }); - auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 128, 64, 12 } }); + const auto aligned_batch2_size = align_to(BATCH_SIZE, align_size_b2); + auto padding_size_batch2 = static_cast(aligned_batch2_size - BATCH_SIZE); + + const auto aligned_m_size = align_to(M_SIZE, align_size_m); + auto padding_size_m = static_cast(aligned_m_size - M_SIZE); + const auto aligned_k_size = align_to(K_SIZE, align_size_k); + auto padding_size_k = static_cast(aligned_k_size - K_SIZE); + const auto aligned_n_size = align_to(N_SIZE, align_size_n); + auto padding_size_n = static_cast(aligned_n_size - N_SIZE); + + ov::Shape in1_shape = { 1, BATCH_SIZE, M_SIZE, K_SIZE }; + ov::Shape in2_shape = { 1, BATCH_SIZE, K_SIZE, N_SIZE }; + ov::Shape in1_shape_aligned = { aligned_batch1_size, aligned_batch2_size, aligned_m_size, aligned_k_size }; + ov::Shape in2_shape_aligned = { aligned_batch1_size, aligned_batch2_size, aligned_k_size, aligned_n_size }; + + // Use dynamic padding for all BFYX dimensions + tensor dyn_pad_dims_input1({0, 0, 0, 0}, 0); + tensor dyn_pad_dims_input2({0, 0, 0, 0}, 0); + + if (n_dim_only) { + dyn_pad_dims_input1 = tensor({0, 0, 0, 0}, 0); + dyn_pad_dims_input2 = tensor({0, 0, 1, 0}, 0); + } else { + dyn_pad_dims_input1 = tensor({1, 1, 1, 1}, 0); + dyn_pad_dims_input2 = tensor({1, 1, 1, 1}, 0); + } + + auto in1_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx, padding({0, 0, 0, 0}, {0, 0, 0, 0}, 0.0f, dyn_pad_dims_input1)}; + auto in2_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx, padding({0, 0, 0, 0}, {0, 0, 0, 0}, 0.0f, dyn_pad_dims_input2)}; + + auto aligned_input1_mem = engine.allocate_memory({ov::PartialShape(in1_shape_aligned), data_types::f16, format::bfyx}); + auto aligned_input2_mem = engine.allocate_memory({ov::PartialShape(in2_shape_aligned), data_types::f16, format::bfyx}); + + auto input1_mem = engine.reinterpret_buffer(*aligned_input1_mem, layout{ov::PartialShape(in1_shape), + data_types::f16, + format::bfyx, + n_dim_only ? padding({0, 0, 0, 0}, {0, 0, 0, 0}, 0.0f, dyn_pad_dims_input1) : + padding({0, 0, 0, 0}, {padding_size_batch1, padding_size_batch2, padding_size_k, padding_size_m}, 0.0f, dyn_pad_dims_input1)}); + + auto input2_mem = engine.reinterpret_buffer(*aligned_input2_mem, layout{ov::PartialShape(in2_shape), + data_types::f16, + format::bfyx, + n_dim_only ? 
padding({0, 0, 0, 0}, {0, 0, padding_size_n, 0}, 0.0f, dyn_pad_dims_input2) : + padding({0, 0, 0, 0}, {padding_size_batch1, padding_size_batch2, padding_size_n, padding_size_k}, 0.0f, dyn_pad_dims_input2)}); + + auto input_1_data = rg.generate_random_1d(ov::shape_size(in1_shape), -2, 2); + auto input_2_data = rg.generate_random_1d(ov::shape_size(in2_shape), -2, 2); + + fill_mem(input1_mem, input_1_data); + fill_mem(input2_mem, input_2_data); + + auto get_ref_results = [&]() { + ov::Shape in1_shape = { 1, BATCH_SIZE, M_SIZE, K_SIZE }; + ov::Shape in2_shape = { 1, BATCH_SIZE, K_SIZE, N_SIZE }; + auto in1_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx}; + auto in2_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx}; + + auto input1_mem = engine.allocate_memory(layout{ov::PartialShape(in1_shape), data_types::f16, format::bfyx}); + auto input2_mem = engine.allocate_memory(layout{ov::PartialShape(in2_shape), data_types::f16, format::bfyx}); + + fill_mem(input1_mem, input_1_data); + fill_mem(input2_mem, input_2_data); + + topology topology; + topology.add(input_layout("input1", in1_layout), + input_layout("input2", in2_layout), + gemm("gemm_ref", { input_info("input1"), input_info("input2") }, data_types::f16, false, false, 1.0f, 0.0f, 4, 4), + permute("permute", input_info("gemm_ref"), {0, 2, 1, 3}), + reorder("reorder", input_info("permute"), format::bfyx, data_types::f32) + ); + + ov::intel_gpu::ImplementationDesc gemm_impl = { format::bfyx, std::string(""), impl_types::onednn }; + ExecutionConfig cfg{ ov::intel_gpu::queue_type(QueueTypes::in_order), + ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", gemm_impl} }), + ov::intel_gpu::optimize_data(true), + ov::intel_gpu::allow_new_shape_infer(true) }; + + network network(engine, topology, cfg); + network.set_input_data("input1", input1_mem); + network.set_input_data("input2", input2_mem); + + auto outputs = network.execute(); + OPENVINO_ASSERT(outputs.size() == 1); + OPENVINO_ASSERT(outputs.begin()->first == "reorder"); + + auto inst = network.get_primitive("reorder"); + + auto output_mem = outputs.at("reorder").get_memory(); + auto output_layout = outputs.at("reorder").get_layout(); + + return engine.reinterpret_buffer(*output_mem, output_layout); + }; - topology topology; - topology.add(input_layout("input0", input0->get_layout())); - topology.add(permute("permute0", input_info("input0"), {0, 3, 1, 2})); - topology.add(input_layout("input1", input1->get_layout())); - topology.add(permute("permute1", input_info("input1"), {0, 3, 1, 2})); - topology.add(gemm("gemm", { input_info("permute0"), input_info("permute1") }, data_types::f16, false, true)); + topology topology; + topology.add(input_layout("input1", in1_layout), + input_layout("input2", in2_layout), + gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f16, false, false, 1.0f, 0.0f, 4, 4), + permute("permute", input_info("gemm"), {0, 2, 1, 3}), + reorder("reorder", input_info("permute"), format::bfyx, data_types::f32) + ); + + ov::intel_gpu::ImplementationDesc gemm_impl = { format::bfyx, std::string(""), impl_types::onednn }; + ExecutionConfig cfg{ ov::intel_gpu::queue_type(QueueTypes::in_order), + ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", gemm_impl} }), + ov::intel_gpu::optimize_data(true), + ov::intel_gpu::allow_new_shape_infer(true) }; + network network(engine, topology, cfg); + network.set_input_data("input1", input1_mem); + network.set_input_data("input2", input2_mem); 
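For reference, the aligned shapes and padding sizes used in this test follow from simple round-up arithmetic. A minimal standalone sketch (align_up is a hypothetical local helper standing in for align_to; the constants are the ones hard-coded above):

#include <cstddef>
#include <iostream>

// Round `value` up to the nearest multiple of `alignment`.
static std::size_t align_up(std::size_t value, std::size_t alignment) {
    return ((value + alignment - 1) / alignment) * alignment;
}

int main() {
    // Test constants: M=11, K=37, N=49, BATCH=31 with alignments 13/16/15/19.
    std::cout << align_up(11, 13) << '\n';  // 13 -> padding_size_m = 2
    std::cout << align_up(37, 16) << '\n';  // 48 -> padding_size_k = 11
    std::cout << align_up(49, 15) << '\n';  // 60 -> padding_size_n = 11
    std::cout << align_up(31, 19) << '\n';  // 38 -> padding_size_batch2 = 7
    return 0;
}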
- ov::intel_gpu::ImplementationDesc impl = { format::bfyx, "", impl_types::onednn }; - ExecutionConfig config{ ov::intel_gpu::queue_type(QueueTypes::in_order), - ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", impl} }), - ov::intel_gpu::optimize_data(true), - ov::intel_gpu::allow_new_shape_infer(false) }; - network network(engine, topology, config); + auto outputs = network.execute(); - auto input0_data = rg.generate_random_1d(input0->get_layout().count(), -1, 1); - auto input1_data = rg.generate_random_1d(input1->get_layout().count(), -1, 1); + auto output_mem = outputs.at("reorder").get_memory(); + auto output_layout = outputs.at("reorder").get_layout(); - set_values(input0, input0_data); - set_values(input1, input1_data); + auto res = engine.reinterpret_buffer(*output_mem, output_layout); - network.set_input_data("input0", input0); - network.set_input_data("input1", input1); + auto ref_res = get_ref_results(); + + mem_lock res_lock(res, get_test_stream()); + mem_lock res_ref_lock(ref_res, get_test_stream()); + for (size_t i = 0; i < res->count(); i++) { + ASSERT_EQ(res_lock[i], res_ref_lock[i]) << i; + } + } +}; + +TEST_F(gemm_onednn, impl_replacement_with_cldnn) { + this->test_impl_replacement_with_cldnn(); +} + +// Check gemm_onednn transpose_format() can accept transpose white list format (byfx/bxfy) +TEST_F(gemm_onednn, check_transpose_format_byfx) { + this->test_check_transpose_format({0, 2, 1, 3}); +} + +TEST_F(gemm_onednn, check_transpose_format_bxfy) { + this->test_check_transpose_format({0, 3, 1, 2}); +} + +TEST_F(gemm_onednn, dynamic_padding_all_dim) { + this->test_dynamic_padding(false); +} - ASSERT_NO_FATAL_FAILURE(network.execute()); +TEST_F(gemm_onednn, dynamic_padding_n_dim_only) { + this->test_dynamic_padding(true); } template diff --git a/src/plugins/intel_gpu/tests/unit/transformations/transpose_matmul_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/transpose_matmul_fusion_test.cpp index 61638930c3b63f..f97ac8f9c433a1 100644 --- a/src/plugins/intel_gpu/tests/unit/transformations/transpose_matmul_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/transformations/transpose_matmul_fusion_test.cpp @@ -13,7 +13,7 @@ #include "openvino/op/result.hpp" #include "intel_gpu/op/gemm.hpp" -#include "plugin/transformations/transpose_matmul_fusion.hpp" +#include "plugin/transformations/transpose_fusion.hpp" #include @@ -31,7 +31,7 @@ TEST_F(TransformationTestsF, TranposeMatmulFusion1) { auto matmul = std::make_shared(input_a, input_b); model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input_a, input_b }); - manager.register_pass(); + manager.register_pass(); } { std::vector order_a = {0, 1, 2, 3}; @@ -55,7 +55,7 @@ TEST_F(TransformationTestsF, TranposeMatmulFusion2) { auto matmul = std::make_shared(tranpose_a, input_b); model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input_a, input_b }); - manager.register_pass(); + manager.register_pass(); } { std::vector order_a = {0, 2, 1, 3}; @@ -81,7 +81,7 @@ TEST_F(TransformationTestsF, TranposeMatmulFusion3) { auto matmul = std::make_shared(tranpose_a, tranpose_b); model = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input_a, input_b }); - manager.register_pass(); + manager.register_pass(); } { std::vector order_a = {0, 2, 1, 3}; @@ -109,7 +109,7 @@ TEST_F(TransformationTestsF, TranposeMatmulFusion4) { auto tranpose_c = std::make_shared(matmul, tranpose_c_const); model = std::make_shared(ov::NodeVector{ tranpose_c }, 
ov::ParameterVector{ input_a, input_b }); - manager.register_pass(); + manager.register_pass(); } { std::vector order_a = {0, 2, 1, 3}; diff --git a/src/plugins/intel_gpu/tests/unit/transformations/transpose_sdpa_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/transpose_sdpa_fusion_test.cpp new file mode 100644 index 00000000000000..ebe15f4d806b31 --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/transformations/transpose_sdpa_fusion_test.cpp @@ -0,0 +1,178 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/ov_test_utils.hpp" + +#include "openvino/core/model.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/pass/manager.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/result.hpp" +#include "intel_gpu/op/sdpa.hpp" + +#include "plugin/transformations/transpose_fusion.hpp" + +#include + +using namespace testing; +using namespace ov::intel_gpu; + +namespace ov { +namespace test { +namespace intel_gpu { + +TEST_F(TransformationTestsF, TranposeSDPAFusion1) { + { + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto sdpa = std::make_shared(input_a, input_b, input_c, true); + + model = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + manager.register_pass(); + } + { + std::vector order_a = {0, 1, 2, 3}; + std::vector order_b = {0, 1, 2, 3}; + std::vector order_c = {0, 1, 2, 3}; + std::vector order_output = {0, 1, 2, 3}; + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto sdpa = std::make_shared(input_a, input_b, input_c, order_a, order_b, order_c, order_output, true, ov::element::undefined ); + + model_ref = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, TranposeSDPAFusion2) { + { + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_a_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 2, 1, 3}); + auto tranpose_a = std::make_shared(input_a, tranpose_a_const); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto sdpa = std::make_shared(tranpose_a, input_b, input_c, true); + + model = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + manager.register_pass(); + } + { + std::vector order_a = {0, 2, 1, 3}; + std::vector order_b = {0, 1, 2, 3}; + std::vector order_c = {0, 1, 2, 3}; + std::vector order_output = {0, 1, 2, 3}; + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto sdpa = std::make_shared(input_a, input_b, input_c, order_a, order_b, order_c, order_output, true, 
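The order_a/order_b/order_c vectors recorded on the fused SDPA follow the usual transpose semantics, out_shape[i] = in_shape[order[i]]. A small illustration with made-up dimensions:

#include <array>
#include <cstddef>
#include <iostream>

int main() {
    // Hypothetical [batch, seq, heads, head_dim] shape and the {0, 2, 1, 3}
    // order used by several of these tests.
    const std::array<std::size_t, 4> in_shape{2, 16, 8, 64};
    const std::array<std::size_t, 4> order{0, 2, 1, 3};
    std::array<std::size_t, 4> out_shape{};
    for (std::size_t i = 0; i < order.size(); ++i)
        out_shape[i] = in_shape[order[i]];
    for (auto d : out_shape)
        std::cout << d << ' ';  // prints: 2 8 16 64
    std::cout << '\n';
    return 0;
}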
ov::element::undefined); + + model_ref = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, TranposeSDPAFusion3) { + { + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_a_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 2, 1, 3}); + auto tranpose_a = std::make_shared(input_a, tranpose_a_const); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_b_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {1, 2, 0, 3}); + auto tranpose_b = std::make_shared(input_b, tranpose_b_const); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + + auto sdpa = std::make_shared(tranpose_a, tranpose_b, input_c, false); + + model = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + manager.register_pass(); + } + { + std::vector order_a = {0, 2, 1, 3}; + std::vector order_b = {1, 2, 0, 3}; + std::vector order_c = {0, 1, 2, 3}; + std::vector order_output = {0, 1, 2, 3}; + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto sdpa = std::make_shared(input_a, input_b, input_c, order_a, order_b, order_c, order_output, false, ov::element::undefined); + + model_ref = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, TranposeSDPAFusion4) { + { + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_a_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 2, 1, 3}); + auto tranpose_a = std::make_shared(input_a, tranpose_a_const); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_b_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 2, 1, 3}); + auto tranpose_b = std::make_shared(input_b, tranpose_b_const); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_c_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 2, 1, 3}); + auto tranpose_c = std::make_shared(input_c, tranpose_c_const); + + auto sdpa = std::make_shared(tranpose_a, tranpose_b, tranpose_c, false); + + model = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + manager.register_pass(); + } + { + std::vector order_a = {0, 2, 1, 3}; + std::vector order_b = {0, 2, 1, 3}; + std::vector order_c = {0, 2, 1, 3}; + std::vector order_output = {0, 1, 2, 3}; + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto sdpa = std::make_shared(input_a, input_b, input_c, order_a, order_b, order_c, order_output, false, ov::element::undefined); + + model_ref = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, TranposeSDPAFusion5) { + { + auto 
input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_a_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 2, 1, 3}); + auto tranpose_a = std::make_shared(input_a, tranpose_a_const); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_b_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 2, 1, 3}); + auto tranpose_b = std::make_shared(input_b, tranpose_b_const); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_c_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {3, 2, 1, 0}); + auto tranpose_c = std::make_shared(input_c, tranpose_c_const); + + auto sdpa = std::make_shared(tranpose_a, tranpose_b, tranpose_c, false); + + model = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + manager.register_pass(); + } + { + auto input_a = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_a_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 2, 1, 3}); + auto tranpose_a = std::make_shared(input_a, tranpose_a_const); + auto input_b = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_b_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 2, 1, 3}); + auto tranpose_b = std::make_shared(input_b, tranpose_b_const); + auto input_c = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto tranpose_c_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {3, 2, 1, 0}); + auto tranpose_c = std::make_shared(input_c, tranpose_c_const); + + auto sdpa = std::make_shared(tranpose_a, tranpose_b, tranpose_c, false); + + model_ref = std::make_shared(ov::NodeVector{ sdpa }, ov::ParameterVector{ input_a, input_b, input_c }); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +} // namespace intel_gpu +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_npu/src/backend/include/zero_backend.hpp b/src/plugins/intel_npu/src/backend/include/zero_backend.hpp index 587d17be09adcb..7f4524ec8127ca 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_backend.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_backend.hpp @@ -7,6 +7,7 @@ #include #include +#include "intel_npu/utils/logger/logger.hpp" #include "npu.hpp" #include "zero_init.hpp" @@ -30,6 +31,7 @@ class ZeroEngineBackend final : public IEngineBackend { std::shared_ptr _instance; std::map> _devices{}; + Logger _logger; }; } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/backend/src/zero_backend.cpp b/src/plugins/intel_npu/src/backend/src/zero_backend.cpp index 0cbd12e91878f2..94a87ab725dae6 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_backend.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_backend.cpp @@ -11,13 +11,15 @@ namespace intel_npu { -ZeroEngineBackend::ZeroEngineBackend(const Config& config) { +ZeroEngineBackend::ZeroEngineBackend(const Config& config) : _logger("ZeroEngineBackend", config.get()) { + _logger.debug("ZeroEngineBackend - initialize started"); Logger::global().setLevel(config.get()); _instance = std::make_shared(); auto device = std::make_shared(_instance); _devices.emplace(std::make_pair(device->getName(), device)); + _logger.debug("ZeroEngineBackend - initialize completed"); } uint32_t ZeroEngineBackend::getDriverVersion() const { @@ -36,8 +38,10 @@ ZeroEngineBackend::~ZeroEngineBackend() = default; const 
std::shared_ptr ZeroEngineBackend::getDevice() const { if (_devices.empty()) { + _logger.debug("ZeroEngineBackend - getDevice() returning empty list"); return {}; } else { + _logger.debug("ZeroEngineBackend - getDevice() returning device list"); return _devices.begin()->second; } } @@ -48,10 +52,12 @@ const std::shared_ptr ZeroEngineBackend::getDevice(const std::string& / } const std::vector ZeroEngineBackend::getDeviceNames() const { + _logger.debug("ZeroEngineBackend - getDeviceNames started"); std::vector devicesNames; std::for_each(_devices.cbegin(), _devices.cend(), [&devicesNames](const auto& device) { devicesNames.push_back(device.first); }); + _logger.debug("ZeroEngineBackend - getDeviceNames completed and returning result"); return devicesNames; } diff --git a/src/plugins/intel_npu/src/backend/src/zero_device.cpp b/src/plugins/intel_npu/src/backend/src/zero_device.cpp index cfa0cdaef34713..a29261bffe7d65 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_device.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_device.cpp @@ -16,6 +16,7 @@ ZeroDevice::ZeroDevice(const std::shared_ptr& initStructs : _initStructs(initStructs), _graph_ddi_table_ext(_initStructs->getGraphDdiTable()), log("ZeroDevice", Logger::global().level()) { + log.debug("ZeroDevice::ZeroDevice init"); device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; zeroUtils::throwOnFail("zeDeviceGetProperties", zeDeviceGetProperties(_initStructs->getDevice(), &device_properties)); @@ -70,6 +71,7 @@ ZeroDevice::ZeroDevice(const std::shared_ptr& initStructs "zeDeviceGetCommandQueueGroupProperties", zeDeviceGetCommandQueueGroupProperties(_initStructs->getDevice(), &command_queue_group_count, nullptr)); + log.debug("ZeroDevice::ZeroDevice - resize command_queue_group_count"); command_group_properties.resize(command_queue_group_count); for (auto& prop : command_group_properties) { @@ -83,7 +85,9 @@ ZeroDevice::ZeroDevice(const std::shared_ptr& initStructs command_group_properties.data())); // Find the corresponding command queue group. 
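The debug message added next brackets the group-ordinal lookup. As a rough, hedged illustration of what such a lookup does (not the actual zeroUtils::findGroupOrdinal, which also consults the device properties), a compute-capable queue group is typically picked like this:

#include <level_zero/ze_api.h>

#include <cstdint>
#include <vector>

// Illustrative only: return the ordinal of the first queue group that
// advertises compute capability, falling back to group 0.
uint32_t find_compute_group_ordinal(const std::vector<ze_command_queue_group_properties_t>& groups) {
    for (uint32_t ordinal = 0; ordinal < static_cast<uint32_t>(groups.size()); ++ordinal) {
        if (groups[ordinal].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
            return ordinal;
        }
    }
    return 0;
}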
+ log.debug("ZeroDevice::ZeroDevice - findGroupOrdinal"); _group_ordinal = zeroUtils::findGroupOrdinal(command_group_properties, device_properties); + log.debug("ZeroDevice::ZeroDevice - init completed"); } std::shared_ptr ZeroDevice::createExecutor( diff --git a/src/plugins/intel_npu/src/backend/src/zero_executor.cpp b/src/plugins/intel_npu/src/backend/src/zero_executor.cpp index 468fbf6d95c761..4882a552155883 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_executor.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_executor.cpp @@ -44,19 +44,23 @@ ZeroExecutor::ZeroExecutor(const std::shared_ptr& i zeroUtils::toZeQueuePriority(_config.get()), _config, group_ordinal)}} { + _logger.debug("ZeroExecutor::ZeroExecutor - create graph_command_list"); OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Executor::ZeroExecutor"); CommandList graph_command_list(_initStructs->getDevice(), _initStructs->getContext(), _initStructs->getGraphDdiTable(), _config, _group_ordinal); + _logger.debug("ZeroExecutor::ZeroExecutor - create graph_command_queue"); CommandQueue graph_command_queue(_initStructs->getDevice(), _initStructs->getContext(), ZE_COMMAND_QUEUE_PRIORITY_NORMAL, _config, _group_ordinal); + _logger.debug("ZeroExecutor::ZeroExecutor - create fence"); Fence fence(graph_command_queue, _config); + _logger.debug("ZeroExecutor::ZeroExecutor - create graph"); OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_GRAPH, itt::domains::LevelZeroBackend, "Executor::ZeroExecutor", "graphCreate"); ze_graph_desc_t desc{ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES, @@ -79,6 +83,7 @@ ZeroExecutor::ZeroExecutor(const std::shared_ptr& i } OV_ITT_TASK_NEXT(ZERO_EXECUTOR_GRAPH, "pfnGetArgumentProperties3"); + _logger.debug("ZeroExecutor::ZeroExecutor - performing pfnGetArgumentProperties3"); for (uint32_t index = 0; index < _props.numGraphArgs; ++index) { ze_graph_argument_properties_3_t arg3; zeroUtils::throwOnFail("pfnGetArgumentProperties3", @@ -104,12 +109,17 @@ ZeroExecutor::ZeroExecutor(const std::shared_ptr& i } OV_ITT_TASK_NEXT(ZERO_EXECUTOR_GRAPH, "appendGraphInitialize"); + _logger.debug("ZeroExecutor::ZeroExecutor - performing appendGraphInitialize"); graph_command_list.appendGraphInitialize(_graph); + _logger.debug("ZeroExecutor::ZeroExecutor - closing graph command list"); graph_command_list.close(); OV_ITT_TASK_NEXT(ZERO_EXECUTOR_GRAPH, "queue_execute"); + _logger.debug("ZeroExecutor::ZeroExecutor - performing executeCommandList"); graph_command_queue.executeCommandList(graph_command_list, fence); + _logger.debug("ZeroExecutor::ZeroExecutor - performing hostSynchronize"); fence.hostSynchronize(); + _logger.debug("ZeroExecutor::ZeroExecutor - hostSynchronize completed"); } void ZeroExecutor::setArgumentValue(uint32_t argi_, const void* argv_) const { diff --git a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp index 66b3e43017237c..b03981e0448769 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp @@ -155,6 +155,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& _profilingQuery(0, _executor->getInitStructs()->getDevice(), _executor->getInitStructs()->getProfilingDdiTable()) { + _logger.debug("ZeroInferRequest::ZeroInferRequest - SyncInferRequest"); const std::unordered_map& executorInputDescriptors = _executor->inputs_desc_map(); const std::unordered_map& executorOutputDescriptors = @@ -162,6 +163,7 @@ ZeroInferRequest::ZeroInferRequest(const 
std::shared_ptr& auto proftype = config.get(); if (proftype == ov::intel_npu::ProfilingType::INFER) { + _logger.debug("ZeroInferRequest::ZeroInferRequest - profiling type == ov::intel_npu::ProfilingType::INFER"); _npuProfiling = std::make_shared(_executor->getInitStructs()->getContext(), _executor->getInitStructs()->getDevice(), _config.get()); @@ -178,6 +180,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& auto allocator = zeroMemory::HostMemAllocator(backendPtr); + _logger.debug("ZeroInferRequest::ZeroInferRequest - performing I/O buffer allocation using Level Zero API"); for (const std::string& inputName : _metadata.inputNames) { if (!executorInputDescriptors.count(inputName)) { OPENVINO_THROW("Invalid graph input descriptor key: " + inputName); @@ -230,6 +233,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& } } + _logger.debug("ZeroInferRequest::ZeroInferRequest - checking level zero attributes and allocate tensor"); for (const std::string& outputName : _metadata.outputNames) { IONodeDescriptor& resultDescriptor = _metadata.results.at(outputName); checkLevelZeroAttributesMatch(resultDescriptor, executorOutputDescriptors.at(outputName), outputName); @@ -257,6 +261,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& } } + _logger.debug("ZeroInferRequest::ZeroInferRequest - capturing latest tensor value in output"); for (const std::string& stateName : _metadata.stateNames) { const std::string& stateInputBufferName = READVALUE_PREFIX + stateName; const std::string& stateOutputBufferName = ASSIGN_PREFIX + stateName; @@ -281,6 +286,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& allocate_tensor(stateName, stateDescriptor, TensorType::State, allocator); } + _logger.debug("ZeroInferRequest::ZeroInferRequest - constructing pipeline"); /// Construct pipepline _pipeline = makePipeline(_executorPtr, _config, @@ -289,6 +295,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& _npuProfiling, _copyAllTensors, _batchSize); + _logger.debug("ZeroInferRequest::ZeroInferRequest - SyncInferRequest completed"); } void ZeroInferRequest::infer() { @@ -382,6 +389,10 @@ void ZeroInferRequest::check_network_precision(const ov::element::Type_t precisi break; case ov::element::Type_t::f16: break; + case ov::element::Type_t::u4: + break; + case ov::element::Type_t::i4: + break; case ov::element::Type_t::u8: break; case ov::element::Type_t::i8: @@ -400,11 +411,12 @@ void ZeroInferRequest::check_network_precision(const ov::element::Type_t precisi break; default: OPENVINO_THROW("Unsupported tensor precision: " + ov::element::Type(precision).get_type_name() + - "! Supported precisions: FP32, FP16, U8, I8, U16, I16, U32, I32, U64, I64"); + "! 
Supported precisions: FP32, FP16, U4, I4, U8, I8, U16, I16, U32, I32, U64, I64"); } } std::vector ZeroInferRequest::get_profiling_info() const { + _logger.debug("InferRequest::get_profiling_info started"); const auto& compiledModel = *std::dynamic_pointer_cast(_compiledModel); const auto& compilerConfig = compiledModel.get_config(); if (!compilerConfig.get() || !_config.get()) { @@ -428,6 +440,7 @@ std::vector ZeroInferRequest::get_profiling_info() const { return _profilingQuery.getLayerStatistics(); } } + _logger.debug("InferRequest::get_profiling_info completed"); } std::vector ZeroInferRequest::get_raw_profiling_data() const { diff --git a/src/plugins/intel_npu/src/backend/src/zero_init.cpp b/src/plugins/intel_npu/src/backend/src/zero_init.cpp index 05e1a76b8d5215..9efeda7b01f4a9 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_init.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_init.cpp @@ -62,6 +62,7 @@ static std::tuple queryDriverExtensionVersion(ze_driver_h ZeroInitStructsHolder::ZeroInitStructsHolder() : log("NPUZeroInitStructsHolder", Logger::global().level()) { OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "ZeroInitStructsHolder::ZeroInitStructsHolder"); + log.debug("ZeroInitStructsHolder - performing zeInit on VPU only"); zeroUtils::throwOnFail("zeInit", zeInit(ZE_INIT_FLAG_VPU_ONLY)); uint32_t drivers = 0; @@ -72,6 +73,7 @@ ZeroInitStructsHolder::ZeroInitStructsHolder() : log("NPUZeroInitStructsHolder", // Get our target driver driver_properties.stype = ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES; + log.debug("ZeroInitStructsHolder - setting driver properties to ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES"); for (uint32_t i = 0; i < drivers; ++i) { zeDriverGetProperties(all_drivers[i], &driver_properties); @@ -105,6 +107,7 @@ ZeroInitStructsHolder::ZeroInitStructsHolder() : log("NPUZeroInitStructsHolder", // Query our graph extension version std::string graph_ext_name; + log.debug("ZeroInitStructsHolder - tie output of queryDriverExtensionVersion"); std::tie(driver_ext_version, graph_ext_name) = queryDriverExtensionVersion(driver_handle); log.debug("Found Driver Version %d.%d, Driver Extension Version %d.%d (%s)", @@ -136,10 +139,12 @@ ZeroInitStructsHolder::ZeroInitStructsHolder() : log("NPUZeroInitStructsHolder", ze_context_desc_t context_desc = {ZE_STRUCTURE_TYPE_CONTEXT_DESC, 0, 0}; zeroUtils::throwOnFail("zeContextCreate", zeContextCreate(driver_handle, &context_desc, &context)); + log.debug("ZeroInitStructsHolder initialize complete"); } ZeroInitStructsHolder::~ZeroInitStructsHolder() { if (context) { + log.debug("ZeroInitStructsHolder - performing zeContextDestroy"); auto result = zeContextDestroy(context); if (ZE_RESULT_SUCCESS != result) { log.error("zeContextDestroy failed %#X", uint64_t(result)); diff --git a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp index 38bebff17de601..c34f5578e55120 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp @@ -35,7 +35,9 @@ struct DiscretePipeline final : public Pipeline { _event_pool(device_handle, context, stage::COUNT, _config), _event{{{_event_pool.handle(), stage::UPLOAD, _config}, {_event_pool.handle(), stage::EXECUTE, _config}, - {_event_pool.handle(), stage::READBACK, _config}}} { + {_event_pool.handle(), stage::READBACK, _config}}}, + _logger("DiscretePipeline", _config.get()) { + _logger.debug("DiscretePipeline - initialize started"); const ZeroExecutor* executor = 
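With this change the zero infer request also accepts 4-bit integer tensors. A compact restatement of the whitelist enforced by check_network_precision (illustrative helper, not plugin code):

#include "openvino/core/type/element_type.hpp"

// Returns true for the precisions the NPU infer request now accepts;
// u4 and i4 are the newly allowed entries.
bool is_supported_precision(ov::element::Type_t p) {
    switch (p) {
    case ov::element::Type_t::f32:
    case ov::element::Type_t::f16:
    case ov::element::Type_t::u4:   // newly allowed
    case ov::element::Type_t::i4:   // newly allowed
    case ov::element::Type_t::u8:
    case ov::element::Type_t::i8:
    case ov::element::Type_t::u16:
    case ov::element::Type_t::i16:
    case ov::element::Type_t::u32:
    case ov::element::Type_t::i32:
    case ov::element::Type_t::u64:
    case ov::element::Type_t::i64:
        return true;
    default:
        return false;
    }
}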
static_cast(executorPtr.get()); static const std::size_t alignment = STANDARD_PAGE_SIZE; @@ -45,6 +47,8 @@ struct DiscretePipeline final : public Pipeline { } _deviceInputs.allocate(device_handle, context); + _logger.debug("DiscretePipeline - appending memory copy and set argument value for input"); + for (const auto& desc : executor->inputs_desc_map()) { const std::shared_ptr& inputTensor = tensors.at(desc.first); const void* tensorBuffer = reinterpret_cast(inputTensor->data()); @@ -57,6 +61,8 @@ struct DiscretePipeline final : public Pipeline { executor->setArgumentValue(desc.second.idx, _deviceInputs.getDevicePtr(desc.first)); } + _logger.debug("DiscretePipeline - append signal event"); + _command_list[stage::UPLOAD].appendBarrier(); _event[stage::UPLOAD].AppendSignalEvent(_command_list[stage::UPLOAD]); @@ -65,6 +71,7 @@ struct DiscretePipeline final : public Pipeline { } _deviceOutputs.allocate(device_handle, context); + _logger.debug("DiscretePipeline - appending memory copy and set argument value for output"); for (const auto& desc : executor->outputs_desc_map()) { const std::shared_ptr& outputTensor = tensors.at(desc.first); void* tensorBuffer = reinterpret_cast(outputTensor->data()); @@ -80,14 +87,15 @@ struct DiscretePipeline final : public Pipeline { } _event[stage::UPLOAD].AppendWaitOnEvent(_command_list[stage::EXECUTE]); - + _logger.debug("DiscretePipeline - appendGraphExecute"); _command_list[stage::EXECUTE].appendGraphExecute(executor->graph(), profiling_handle); - + _logger.debug("DiscretePipeline - appendEventReset"); _event[stage::UPLOAD].AppendEventReset(_command_list[stage::READBACK]); for (auto& commandList : _command_list) { commandList.close(); } + _logger.debug("DiscretePipeline - initialize completed"); }; DiscretePipeline(const DiscretePipeline&) = delete; @@ -95,6 +103,7 @@ struct DiscretePipeline final : public Pipeline { virtual ~DiscretePipeline() = default; void push(size_t) override { + _logger.debug("DiscretePipeline - push() started"); OV_ITT_TASK_CHAIN(ZERO_INFER_REQUEST_DP_PUSH, itt::domains::LevelZeroBackend, "DiscretePipeline::push", @@ -105,9 +114,11 @@ struct DiscretePipeline final : public Pipeline { OV_ITT_TASK_NEXT(ZERO_INFER_REQUEST_DP_PUSH, "EXECUTE"); // Submit the command list for execute _command_queues[stage::EXECUTE]->executeCommandList(_command_list[stage::EXECUTE], _fence[stage::EXECUTE]); + _logger.debug("DiscretePipeline - push() completed"); }; void pull(size_t) override { + _logger.debug("DiscretePipeline - pull() started"); OV_ITT_TASK_CHAIN(ZERO_INFER_REQUEST_DP_PULL, itt::domains::LevelZeroBackend, "DiscretePipeline::pull", @@ -120,6 +131,7 @@ struct DiscretePipeline final : public Pipeline { // Wait for output copy to finish execution for _fence from the host, to make sure that data // is available in the hostMem buffer of the output _fence[stage::READBACK].hostSynchronize(); + _logger.debug("DiscretePipeline - pull() completed"); }; void reset(size_t) const override { @@ -136,6 +148,7 @@ struct DiscretePipeline final : public Pipeline { std::array _fence; EventPool _event_pool; std::array _event; + Logger _logger; }; struct IntegratedPipeline final : public Pipeline { @@ -154,15 +167,18 @@ struct IntegratedPipeline final : public Pipeline { : _config(config), _command_queue{command_queue}, _event_pool{device_handle, context, batch_size ? 
static_cast(batch_size) : 1, _config}, - _npu_profiling(std::move(npu_profiling)) { + _npu_profiling(std::move(npu_profiling)), + _logger("IntegratedPipeline", _config.get()) { const ZeroExecutor* executor = static_cast(executorPtr.get()); OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Zero_infer_request::IntegratedPipeline::IntegratedPipeline"); + _logger.debug("IntegratedPipeline - initialize started"); _command_lists.reserve(batch_size); _events.reserve(batch_size); _fences.reserve(batch_size); + _logger.debug("IntegratedPipeline - emplace_back _event_pool and _command_queue"); for (size_t i = 0; i < batch_size; i++) { _command_lists.emplace_back( std::make_unique(device_handle, context, graph_ddi_table_ext, _config, group_ordinal)); @@ -209,6 +225,7 @@ struct IntegratedPipeline final : public Pipeline { } _command_lists.at(i)->close(); } + _logger.debug("IntegratedPipeline - initialize completed"); } IntegratedPipeline(const IntegratedPipeline&) = delete; @@ -216,15 +233,18 @@ struct IntegratedPipeline final : public Pipeline { virtual ~IntegratedPipeline() = default; void push(size_t batch_index) override { + _logger.debug("IntegratedPipeline - push() started"); OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_PUSH, itt::domains::LevelZeroBackend, "IntegratedPipeline", "push"); if (sync_output_with_fences_) { _command_queue.executeCommandList(*_command_lists.at(batch_index), *_fences.at(batch_index)); } else { _command_queue.executeCommandList(*_command_lists.at(batch_index)); } + _logger.debug("IntegratedPipeline - push() completed"); }; void pull(size_t batch_index) override { + _logger.debug("IntegratedPipeline - pull() started"); OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_PULL, itt::domains::LevelZeroBackend, "IntegratedPipeline", "pull"); if (sync_output_with_fences_) { _fences.at(batch_index)->hostSynchronize(); @@ -235,14 +255,17 @@ struct IntegratedPipeline final : public Pipeline { if (_npu_profiling != nullptr) { _npu_profiling->sampleNpuTimestamps(); } + _logger.debug("IntegratedPipeline - pull() completed"); }; void reset(size_t batch_index) const override { + _logger.debug("IntegratedPipeline - rest() started"); if (sync_output_with_fences_) { _fences.at(batch_index)->reset(); } else { _events.at(batch_index)->reset(); } + _logger.debug("IntegratedPipeline - rest() completed"); }; private: @@ -254,6 +277,7 @@ struct IntegratedPipeline final : public Pipeline { std::vector> _events; bool sync_output_with_fences_ = true; std::shared_ptr _npu_profiling; + Logger _logger; }; std::unique_ptr makePipeline(const std::shared_ptr& executorPtr, diff --git a/src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp b/src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp index 2bb2f951d4d634..9607fdbdd7749e 100644 --- a/src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp +++ b/src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp @@ -763,6 +763,8 @@ NetworkDescription LevelZeroCompilerInDriver::compileIR(const st getLatestBuildError()); auto networkMeta = getNetworkMeta(graphHandle); + networkMeta.name = model->get_friendly_name(); + result = _graphDdiTableExt->pfnDestroy(graphHandle); if (ZE_RESULT_SUCCESS != result) { diff --git a/src/plugins/template/backend/ops/ops_evaluates.hpp b/src/plugins/template/backend/ops/ops_evaluates.hpp index 8a44f4f6f2e6cc..8e7d24f82092a5 100644 --- a/src/plugins/template/backend/ops/ops_evaluates.hpp +++ b/src/plugins/template/backend/ops/ops_evaluates.hpp @@ -7,6 +7,7 @@ #include "openvino/op/rms_norm.hpp" #include 
"ov_ops/augru_cell.hpp" #include "ov_ops/augru_sequence.hpp" +#include "ov_ops/rms.hpp" extern template bool evaluate_node(std::shared_ptr node, ov::TensorVector& outputs, @@ -498,7 +499,7 @@ extern template bool evaluate_node(std::shared_ptr(std::shared_ptr node, +extern template bool evaluate_node(std::shared_ptr node, ov::TensorVector& outputs, const ov::TensorVector& inputs); @@ -516,3 +517,7 @@ extern template bool evaluate_node(std::shared_ptr< extern template bool evaluate_node(std::shared_ptr node, ov::TensorVector& outputs, const ov::TensorVector& inputs); + +extern template bool evaluate_node(std::shared_ptr node, + ov::TensorVector& outputs, + const ov::TensorVector& inputs); diff --git a/src/plugins/template/backend/ops/rms_internal.cpp b/src/plugins/template/backend/ops/rms_internal.cpp new file mode 100644 index 00000000000000..ea8183f843ccd3 --- /dev/null +++ b/src/plugins/template/backend/ops/rms_internal.cpp @@ -0,0 +1,69 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "evaluate_node.hpp" +#include "openvino/core/axis_set.hpp" +#include "openvino/core/rank.hpp" +#include "openvino/core/validation_util.hpp" +#include "openvino/op/util/axes_util.hpp" +#include "openvino/reference/rms_norm.hpp" +#include "openvino/runtime/tensor.hpp" +#include "ov_ops/rms.hpp" +#include "utils.hpp" + +using namespace ov; + +template +bool evaluate(const std::shared_ptr& node, + ov::TensorVector& outputs, + const ov::TensorVector& inputs) { + using ET = typename ov::element_type_traits::value_type; + const auto normalized_axes = + ov::util::normalize_axes(node->get_friendly_name(), std::vector{-1}, inputs[0].get_shape().size()); + + outputs[0].set_shape(inputs[0].get_shape()); + + const auto& in_type = inputs[0].get_element_type(); + const auto& out_type = outputs[0].get_element_type(); + + // The type compression mechanism is implemented for F16 only + // The scale is expected to have the same type as the first input + if (in_type != out_type && out_type == ov::element::f16) { + ov::reference::rms_norm_mul_convert_out(inputs[0].data(), + normalized_axes, + outputs[0].data(), + inputs[0].get_shape(), + node->get_epsilon(), + inputs[1].get_shape(), + inputs[1].data()); + + } else { + ov::reference::rms_norm(inputs[0].data(), + normalized_axes, + outputs[0].data(), + inputs[0].get_shape(), + node->get_epsilon(), + inputs[1].get_shape(), + inputs[1].data()); + } + return true; +} + +template <> +bool evaluate_node(std::shared_ptr node, + ov::TensorVector& outputs, + const ov::TensorVector& inputs) { + switch (node->get_input_element_type(0)) { + case element::bf16: + return evaluate(as_type_ptr(node), outputs, inputs); + case element::f16: + return evaluate(as_type_ptr(node), outputs, inputs); + case element::f64: + return evaluate(as_type_ptr(node), outputs, inputs); + case element::f32: + return evaluate(as_type_ptr(node), outputs, inputs); + default: + OPENVINO_THROW("Unhandled data type ", node->get_input_element_type(0).get_type_name(), " in evaluate_node()"); + } +} diff --git a/src/plugins/template/backend/ops/roi_align_rotated.cpp b/src/plugins/template/backend/ops/roi_align_rotated.cpp index ec409a73b072ca..60373931dffb1d 100644 --- a/src/plugins/template/backend/ops/roi_align_rotated.cpp +++ b/src/plugins/template/backend/ops/roi_align_rotated.cpp @@ -8,7 +8,7 @@ #include "openvino/reference/roi_align.hpp" template -bool evaluate(const std::shared_ptr& op, +bool evaluate(const std::shared_ptr& op, ov::TensorVector& outputs, const 
ov::TensorVector& inputs) { using T = typename ov::element_type_traits::value_type; @@ -33,14 +33,14 @@ bool evaluate(const std::shared_ptr& op, } template <> -bool evaluate_node(std::shared_ptr node, +bool evaluate_node(std::shared_ptr node, ov::TensorVector& outputs, const ov::TensorVector& inputs) { const auto& element_type = node->get_output_element_type(0); #define CASE(type) \ case ov::element::type: \ - return evaluate(ov::as_type_ptr(node), outputs, inputs); + return evaluate(ov::as_type_ptr(node), outputs, inputs); switch (element_type) { CASE(bf16); diff --git a/src/plugins/template/backend/opset_int_tbl.hpp b/src/plugins/template/backend/opset_int_tbl.hpp index 4f1f14f2634867..6174e65f76444c 100644 --- a/src/plugins/template/backend/opset_int_tbl.hpp +++ b/src/plugins/template/backend/opset_int_tbl.hpp @@ -162,7 +162,8 @@ _OPENVINO_OP_REG(Multinomial, ov::op::v13) _OPENVINO_OP_REG(Inverse, ov::op::v14) _OPENVINO_OP_REG(AvgPool, ov::op::v14) _OPENVINO_OP_REG(MaxPool, ov::op::v14) -_OPENVINO_OP_REG(ROIAlignRotated, ov::op::v14) + +_OPENVINO_OP_REG(ROIAlignRotated, ov::op::v15) _OPENVINO_OP_REG(EmbeddingBagOffsets, op::v15) _OPENVINO_OP_REG(EmbeddingBagPacked, op::v15) @@ -170,4 +171,5 @@ _OPENVINO_OP_REG(Col2Im, ov::op::v15) _OPENVINO_OP_REG(AUGRUCell, ov::op::internal) _OPENVINO_OP_REG(AUGRUSequence, ov::op::internal) +_OPENVINO_OP_REG(RMS, ov::op::internal) _OPENVINO_OP_REG(RMSNorm, ov::op::internal) diff --git a/src/plugins/template/tests/functional/op_reference/base_reference_test.cpp b/src/plugins/template/tests/functional/op_reference/base_reference_test.cpp index 239405dbca6b19..f457ab09b90846 100644 --- a/src/plugins/template/tests/functional/op_reference/base_reference_test.cpp +++ b/src/plugins/template/tests/functional/op_reference/base_reference_test.cpp @@ -76,13 +76,7 @@ void CommonReferenceTest::Validate() { ASSERT_EQ(refOutData.size(), actualOutData.size()); for (size_t i = 0; i < refOutData.size(); i++) { - ValidateBlobs(refOutData[i], - actualOutData[i], - i, - threshold, - abs_threshold, - legacy_compare, - actual_comparision_size); + ValidateBlobs(refOutData[i], actualOutData[i], i, threshold, abs_threshold, legacy_compare); } } @@ -91,15 +85,12 @@ void CommonReferenceTest::ValidateBlobs(const ov::Tensor& refBlob, const size_t blob_idx, float threshold, float abs_threshold, - bool legacy_compare, - size_t actual_comparision_size) { + bool legacy_compare) { ASSERT_EQ(refBlob.get_element_type(), outBlob.get_element_type()) << "Incompatible element type for blob with index " << blob_idx; ASSERT_EQ(refBlob.get_byte_size(), outBlob.get_byte_size()) << "Incorrect byte size for blob with index " << blob_idx; - if (actual_comparision_size == 0) - actual_comparision_size = refBlob.get_size(); // compare() get fundamental element type with element_type_traits firstly and cast data to relative ov type with // 'from' types listed below have a fundamental analogue as int8_t, but int8_t is converted only to i8 with from std::vector raw_data_comp_only = @@ -133,6 +124,8 @@ void CommonReferenceTest::ValidateBlobs(const ov::Tensor& refBlob, } return; } + + const auto actual_comparision_size = refBlob.get_size(); switch (element_type) { case ov::element::bf16: ov::test::utils::compare_raw_data(refBlob.data(), diff --git a/src/plugins/template/tests/functional/op_reference/base_reference_test.hpp b/src/plugins/template/tests/functional/op_reference/base_reference_test.hpp index 94923ca58cb8ef..a228c908850b42 100644 --- 
a/src/plugins/template/tests/functional/op_reference/base_reference_test.hpp +++ b/src/plugins/template/tests/functional/op_reference/base_reference_test.hpp @@ -28,8 +28,7 @@ class CommonReferenceTest { const size_t blob_idx, float threshold, float abs_threshold, - bool legacy_compare, - size_t actual_comparision_size); + bool legacy_compare); protected: bool legacy_compare = false; @@ -42,9 +41,8 @@ class CommonReferenceTest { std::vector inputData; std::vector refOutData; std::vector actualOutData; - float threshold = 1e-2f; // Relative diff - float abs_threshold = -1.f; // Absolute diff (not used when negative) - size_t actual_comparision_size = 0; // For ref output data is smaller than output blob size + float threshold = 1e-2f; // Relative diff + float abs_threshold = -1.f; // Absolute diff (not used when negative) }; template diff --git a/src/plugins/template/tests/functional/op_reference/experimental_detectron_detection_prior_grid.cpp b/src/plugins/template/tests/functional/op_reference/experimental_detectron_detection_prior_grid.cpp index 5450c11a74a41e..58d577cb6a06aa 100644 --- a/src/plugins/template/tests/functional/op_reference/experimental_detectron_detection_prior_grid.cpp +++ b/src/plugins/template/tests/functional/op_reference/experimental_detectron_detection_prior_grid.cpp @@ -7,9 +7,10 @@ #include "base_reference_test.hpp" #include "openvino/op/experimental_detectron_prior_grid_generator.hpp" -using namespace reference_tests; using namespace ov; +using reference_tests::CommonReferenceTest; +using reference_tests::CreateTensor; using Attrs = op::v6::ExperimentalDetectronPriorGridGenerator::Attributes; namespace { @@ -30,7 +31,6 @@ struct ExperimentalPGGParams { imageSizeInfoShape(imageSizeInfoShape), outRefShape(outRefShape), inType(iType), - outType(iType), priorsData(CreateTensor(iType, priorsValues)), refData(CreateTensor(outRefShape, iType, refValues)), testcaseName(testcaseName) { @@ -54,12 +54,11 @@ struct ExperimentalPGGParams { PartialShape imageSizeInfoShape; Shape outRefShape; size_t actualComparisonSize; - ov::element::Type inType; - ov::element::Type outType; - ov::Tensor priorsData; - ov::Tensor featureMapData; - ov::Tensor imageSizeInfoData; - ov::Tensor refData; + element::Type inType; + Tensor priorsData; + Tensor featureMapData; + Tensor imageSizeInfoData; + Tensor refData; std::string testcaseName; }; @@ -67,23 +66,18 @@ class ReferenceExperimentalPGGLayerTest : public testing::TestWithParam 0) - actual_comparision_size = params.actualComparisonSize; } static std::string getTestCaseName(const testing::TestParamInfo& obj) { - auto param = obj.param; + const auto& param = obj.param; std::ostringstream result; result << "priorsShape=" << param.priorsShape << "_"; result << "featureMapShape=" << param.featureMapShape << "_"; result << "imageSizeInfoShape=" << param.imageSizeInfoShape << "_"; result << "iType=" << param.inType << "_"; - result << "oType=" << param.outType << "_"; result << "flatten=" << param.attrs.flatten << "_"; result << "h=" << param.attrs.h << "_"; result << "w=" << param.attrs.w << "_"; @@ -94,6 +88,26 @@ class ReferenceExperimentalPGGLayerTest : public testing::TestWithParam CreateFunction(const ExperimentalPGGParams& params) { const auto priors = std::make_shared(params.inType, params.priorsShape); @@ -103,7 +117,7 @@ class ReferenceExperimentalPGGLayerTest : public testing::TestWithParam(NodeVector{ExperimentalPGG}, ParameterVector{priors, featureMap, im_info}); + return std::make_shared(NodeVector{ExperimentalPGG}, 
ParameterVector{priors, featureMap, im_info}); } }; diff --git a/src/plugins/template/tests/functional/op_reference/memory.cpp b/src/plugins/template/tests/functional/op_reference/memory.cpp index 59ad59106eba66..ef16fae9f73e10 100644 --- a/src/plugins/template/tests/functional/op_reference/memory.cpp +++ b/src/plugins/template/tests/functional/op_reference/memory.cpp @@ -302,8 +302,7 @@ class ReferenceMemoryTest : public testing::TestWithParam { i, 1e-2f, -1.f, - true, - 0); + true); } } diff --git a/src/plugins/template/tests/functional/op_reference/rms_internal.cpp b/src/plugins/template/tests/functional/op_reference/rms_internal.cpp new file mode 100644 index 00000000000000..433d2e710d2a2d --- /dev/null +++ b/src/plugins/template/tests/functional/op_reference/rms_internal.cpp @@ -0,0 +1,437 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "base_reference_test.hpp" +#include "common_test_utils/common_utils.hpp" +#include "openvino/op/constant.hpp" +#include "ov_ops/rms.hpp" + +using namespace ov; +using namespace reference_tests; + +struct RMSParams { + RMSParams(const reference_tests::Tensor& paramInput, + const reference_tests::Tensor& paramReductionAxes, + const double eps, + const reference_tests::Tensor& paramExpected, + const reference_tests::Tensor& paramScale = {}) + : input(paramInput), + reductionAxes(paramReductionAxes), + eps(eps), + expected(paramExpected) { + if (paramScale.data) { + scale = paramScale; + } + } + reference_tests::Tensor input; + reference_tests::Tensor scale; + // Warning: Axes input is not currently supported by internal::RMS, it's always assumed to be "-1" + reference_tests::Tensor reductionAxes; + double eps; + reference_tests::Tensor expected; +}; + +class ReferenceRMSLayerTest : public testing::TestWithParam, public CommonReferenceTest { +public: + void SetUp() override { + auto params = GetParam(); + const auto output_type = + params.expected.type == params.input.type ? 
ov::element::undefined : params.expected.type; + function = CreateFunction(params.input, params.eps, params.scale, output_type); + if (!params.scale.data) { + inputData = {params.input.data}; + } else { + inputData = {params.input.data, params.scale.data}; + } + refOutData = {params.expected.data}; + if (params.input.type == ov::element::f32) { + threshold = 1e-5f; // Set more precise threshold to detect eps changes + } + } + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + auto param = obj.param; + std::ostringstream result; + result << "shape=" << param.input.shape; + result << "_iType=" << param.input.type; + result << "_oType=" << param.expected.type; + result << "_axesType=" << param.reductionAxes.type; + result << "_reductionAxes=" + << ov::test::utils::vec2str(op::v0::Constant(param.reductionAxes.data).cast_vector()); + if (param.scale.data) { + result << "_scaleShape=" << param.scale.shape; + } + result << "_eps=" << param.eps; + return result.str(); + } + +private: + static std::shared_ptr CreateFunction(const reference_tests::Tensor& input, + const double eps, + const reference_tests::Tensor& scale, + const ov::element::Type& output_type) { + const auto in = std::make_shared(input.type, input.shape); + + if (!scale.data) { + const auto scale_const = std::make_shared(input.type, input.shape, 1.0); + const auto rms_norm = std::make_shared(in, scale_const, eps, output_type); + return std::make_shared(NodeVector{rms_norm}, ParameterVector{in}); + } + const auto scale_param = std::make_shared(scale.type, scale.shape); + const auto rms_norm = std::make_shared(in, scale_param, eps, output_type); + return std::make_shared(NodeVector{rms_norm}, ParameterVector{in, scale_param}); + } +}; + +TEST_P(ReferenceRMSLayerTest, CompareWithHardcodedRefs) { + Exec(); +} + +INSTANTIATE_TEST_SUITE_P( + smoke_RMSInternal_With_Hardcoded_Refs, + ReferenceRMSLayerTest, + ::testing::Values( + RMSParams(reference_tests::Tensor{Shape{8}, + ov::element::f32, + std::vector({-6.44250308, + -59.65135475, + 28.08134504, + -3.38603289, + 1.047344, + -22.62146978, + 58.72749089, + 16.00083578})}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{Shape{8}, + ov::element::f32, + std::vector{-0.19629386, + -1.81749151, + 0.85559844, + -0.10316758, + 0.03191107, + -0.68924385, + 1.7893427, + 0.48752259}}), + RMSParams(reference_tests::Tensor{Shape{8}, + ov::element::f32, + std::vector({-6.44250308, + -59.65135475, + 28.08134504, + -3.38603289, + 1.047344, + -22.62146978, + 58.72749089, + 16.00083578})}, + reference_tests::Tensor{Shape{1}, ov::element::i32, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{Shape{8}, + ov::element::f32, + std::vector{-0.19629386, + -1.81749151, + 0.85559844, + -0.10316758, + 0.03191107, + -0.68924385, + 1.7893427, + 0.48752259}}), + RMSParams(reference_tests::Tensor{Shape{8}, + ov::element::f32, + std::vector({-6.44250308, + -59.65135475, + 28.08134504, + -3.38603289, + 1.047344, + -22.62146978, + 58.72749089, + 16.00083578})}, + reference_tests::Tensor{Shape{1}, ov::element::i32, std::vector({-1})}, + 1e-2, + reference_tests::Tensor{Shape{8}, + ov::element::f32, + std::vector{-0.19629295, + -1.81748319, + 0.85559446, + -0.10316710, + 0.03191093, + -0.68924063, + 1.78933442, + 0.48752034}}), + RMSParams(reference_tests::Tensor{Shape{8}, + ov::element::f32, + std::vector({-6.44250308, + -59.65135475, + 28.08134504, + -3.38603289, + 1.047344, + -22.62146978, + 58.72749089, + 16.00083578})}, + 
reference_tests::Tensor{Shape{1}, ov::element::i32, std::vector({-1})}, + 5.55, + reference_tests::Tensor{Shape{8}, + ov::element::f32, + std::vector{-0.19579013, + -1.81282747, + 0.85340279, + -0.10290283, + 0.03182918, + -0.68747509, + 1.78475082, + 0.48627150}}), + RMSParams( + reference_tests::Tensor{ + Shape{2, 3}, + ov::element::f32, + std::vector({-6.44250308, -59.65135475, 28.08134504, -3.38603289, 1.047344, -22.62146978})}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 3}, + ov::element::f32, + std::vector{-0.16844749, -1.559661, 0.7342227, -0.25613253, 0.07922512, -1.71117484}}), + + RMSParams(reference_tests::Tensor{Shape{2, 3, 1}, + ov::element::f32, + std::vector( + {-0.64425033, -5.9651356, 2.8081346, -0.3386033, 0.1047344, -2.262147})}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 3, 1}, + ov::element::f32, + std::vector{-0.99998795, -0.99999986, 0.99999937, -0.99995639, 0.99954449, -0.99999902}}), + + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::f32, + std::vector({-0.64425033, -5.9651356, 2.8081346, -0.3386033, 0.1047344, + -2.262147, 5.872749, 1.6000836, -6.754028, 4.015047, + 9.291021, 0.00016722, 7.7904015, -3.167727, 1.3428825, + -1.4490807, -1.2650547, 5.5311837, 0.71208346, 9.074844, + 0.8841632, -8.358102, -2.673152, 7.01701})}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 2, 2, 3}, + ov::element::f32, + std::vector{-0.16844743, -1.5596604, 0.7342224, -0.2561318, 0.0792249, -1.71117, + 1.1187618, 0.30481678, -1.2866459, 0.687082, 1.5899425, 0.00002862, + 1.5844078, -0.6442507, 0.27311474, -0.4285907, -0.3741618, 1.6359433, + 0.1348591, 1.7186543, 0.1674487, -1.288446, -0.41208065, 1.0817096}}), + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::bf16, + std::vector{ + -0.6445, -5.9688, 2.8125, -0.3379, 0.1045, -2.2656, 5.8750, 1.6016, + -6.7500, 4.0000, 9.3125, 0.0002, 7.7812, -3.1719, 1.3438, -1.4453, + -1.2656, 5.5312, 0.7109, 9.0625, 0.8828, -8.3750, -2.6719, 7.0312}}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 2, 2, 3}, + ov::element::bf16, + std::vector{-0.1680, -1.5625, 0.7344, -0.2559, 0.0791, -1.7188, 1.1172, 0.3047, + -1.2891, 0.6836, 1.5938, 0.0000, 1.5859, -0.6484, 0.2734, -0.4277, + -0.3750, 1.6406, 0.1348, 1.7188, 0.1670, -1.2891, -0.4102, 1.0781}}), + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::bf16, + std::vector{ + -0.6445, -5.9688, 2.8125, -0.3379, 0.1045, -2.2656, 5.8750, 1.6016, + -6.7500, 4.0000, 9.3125, 0.0002, 7.7812, -3.1719, 1.3438, -1.4453, + -1.2656, 5.5312, 0.7109, 9.0625, 0.8828, -8.3750, -2.6719, 7.0312}}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 2, 2, 3}, + ov::element::bf16, + std::vector{-0.0840, -0.7812, 0.3672, -0.1279, 0.0396, -0.8594, 0.5586, 0.1523, + -0.6445, 0.3418, 0.7969, 0.0000, 0.7930, -0.3242, 0.1367, -0.2139, + -0.1875, 0.8203, 0.0674, 0.8594, 0.0835, -0.6445, -0.2051, 0.5391}}, + reference_tests::Tensor{Shape{1}, ov::element::bf16, std::vector{0.5}}), + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::f16, + std::vector{-0.644, -5.965, 2.809, -0.3386, 0.10474, -2.262, + 5.87, 1.6, -6.754, 4.016, 9.29, 0.0001673, + 7.79, -3.168, 1.343, -1.449, -1.265, 5.53, + 0.712, 
9.08, 0.8843, -8.36, -2.674, 7.016}}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 2, 2, 3}, + ov::element::f16, + std::vector{-0.1683, -1.559, 0.734, -0.256, 0.0792, -1.711, 1.118, 0.3047, + -1.286, 0.687, 1.59, 0.0000286, 1.584, -0.644, 0.273, -0.4287, + -0.374, 1.636, 0.1348, 1.719, 0.1675, -1.288, -0.412, 1.081}}), + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::f16, + std::vector{-0.644, -5.965, 2.809, -0.3386, 0.10474, -2.262, + 5.87, 1.6, -6.754, 4.016, 9.29, 0.0001673, + 7.79, -3.168, 1.343, -1.449, -1.265, 5.53, + 0.712, 9.08, 0.8843, -8.36, -2.674, 7.016}}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::f16, + std::vector{ + + -0.08417, -0.7793, 0.367, -0.128, 0.0396, -0.8555, 0.559, 0.1523, + -0.643, 0.3435, 0.795, 0.0000143, 0.792, -0.322, 0.1365, -0.2144, + -0.187, 0.818, 0.0674, 0.8594, 0.08374, -0.644, -0.206, 0.5405}}, + reference_tests::Tensor{Shape{1}, ov::element::f16, std::vector{0.5}}), + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::f64, + std::vector{-0.64425031, -5.96513547, 2.8081345, -0.33860329, + 0.1047344, -2.26214698, 5.87274909, 1.60008358, + -6.75402803, 4.01504693, 9.2910216, 0.00016722, + 7.79040128, -3.16772695, 1.34288255, -1.44908073, + -1.26505474, 5.5311837, 0.71208347, 9.07484454, + 0.8841632, -8.35810155, -2.67315197, 7.01701008}}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 2, 2, 3}, + ov::element::f64, + std::vector{-0.16844743, -1.55966048, 0.73422245, -0.2561318, 0.0792249, -1.71116999, + 1.1187618, 0.30481677, -1.2866459, 0.68708204, 1.58994258, 0.00002862, + 1.58440782, -0.64425068, 0.27311477, -0.42859069, -0.37416182, 1.63594325, + 0.1348591, 1.71865438, 0.1674487, -1.28844602, -0.41208066, 1.0817096}}), + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::f64, + std::vector{-0.64425031, -5.96513547, 2.8081345, -0.33860329, + 0.1047344, -2.26214698, 5.87274909, 1.60008358, + -6.75402803, 4.01504693, 9.2910216, 0.00016722, + 7.79040128, -3.16772695, 1.34288255, -1.44908073, + -1.26505474, 5.5311837, 0.71208347, 9.07484454, + 0.8841632, -8.35810155, -2.67315197, 7.01701008}}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 2, 2, 3}, + ov::element::f64, + std::vector{-0.08422372, -0.77983024, 0.36711123, -0.1280659, 0.03961245, -0.855585, + 0.5593809, 0.15240838, -0.64332295, 0.34354102, 0.79497129, 0.00001431, + 0.79220391, -0.32212534, 0.13655738, -0.21429535, -0.18708091, 0.81797163, + 0.06742955, 0.85932719, 0.08372435, -0.64422301, -0.20604033, 0.5408548}}, + reference_tests::Tensor{Shape{1}, ov::element::f64, std::vector{0.5}}), + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::f32, + std::vector({-0.64425033, -5.9651356, 2.8081346, -0.3386033, 0.1047344, + -2.262147, 5.872749, 1.6000836, -6.754028, 4.015047, + 9.291021, 0.00016722, 7.7904015, -3.167727, 1.3428825, + -1.4490807, -1.2650547, 5.5311837, 0.71208346, 9.074844, + 0.8841632, -8.358102, -2.673152, 7.01701})}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 2, 2, 3}, + ov::element::f32, + std::vector{-0.08422372, -0.77983022, 0.36711121, -0.1280659, 0.03961245, -0.85558498, + 0.55938089, 0.15240839, -0.64332294, 
0.343541, 0.79497123, 0.00001431, + 0.7922039, -0.32212535, 0.13655737, -0.21429534, -0.1870809, 0.81797165, + 0.06742955, 0.85932714, 0.08372435, -0.64422297, -0.20604032, 0.54085481}}, + reference_tests::Tensor{Shape{1}, ov::element::f32, std::vector{0.5}}), + + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::f32, + std::vector({-0.64425033, -5.9651356, 2.8081346, -0.3386033, 0.1047344, + -2.262147, 5.872749, 1.6000836, -6.754028, 4.015047, + 9.291021, 0.00016722, 7.7904015, -3.167727, 1.3428825, + -1.4490807, -1.2650547, 5.5311837, 0.71208346, 9.074844, + 0.8841632, -8.358102, -2.673152, 7.01701})}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 2, 2, 3}, + ov::element::f32, + std::vector{-0.08422372, -2.33949065, 0.1835556, -0.1280659, 0.11883735, -0.42779249, + 0.55938089, 0.45722517, -0.32166147, 0.343541, 2.38491368, 0.00000715, + 0.7922039, -0.96637604, 0.06827869, -0.21429534, -0.56124271, 0.40898582, + 0.06742955, 2.57798141, 0.04186217, -0.64422297, -0.61812097, 0.27042741}}, + reference_tests::Tensor{Shape{1, 3}, ov::element::f32, std::vector{0.5, 1.5, 0.25}}), + RMSParams(reference_tests::Tensor{Shape{2, 2, 2, 3}, + ov::element::f32, + std::vector({-0.64425033, -5.9651356, 2.8081346, -0.3386033, 0.1047344, + -2.262147, 5.872749, 1.6000836, -6.754028, 4.015047, + 9.291021, 0.00016722, 7.7904015, -3.167727, 1.3428825, + -1.4490807, -1.2650547, 5.5311837, 0.71208346, 9.074844, + 0.8841632, -8.358102, -2.673152, 7.01701})}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{2, 2, 2, 3}, + ov::element::f16, + std::vector{-0.08422372, -2.33949065, 0.1835556, -0.1280659, 0.11883735, + -0.42779249, 0.55938089, 0.45722517, -0.32166147, 0.343541, + 2.38491368, 0.00000715, 0.7922039, -0.96637604, 0.06827869, + -0.21429534, -0.56124271, 0.40898582, 0.06742955, 2.57798141, + 0.04186217, -0.64422297, -0.61812097, 0.27042741}}, + reference_tests::Tensor{Shape{1, 3}, ov::element::f32, std::vector{0.5, 1.5, 0.25}}), + RMSParams(reference_tests::Tensor{Shape{1, 3, 3, 3}, + ov::element::f32, + std::vector({1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9})}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-5, + reference_tests::Tensor{ + Shape{1, 3, 3, 3}, + ov::element::f32, + std::vector{0.46290955, 0.92581911, 1.38872866, 0.78954188, 0.98692735, 1.18431282, + 0.87047794, 0.99483193, 1.11918592, 0.46290955, 0.92581911, 1.38872866, + 0.78954188, 0.98692735, 1.18431282, 0.87047794, 0.99483193, 1.11918592, + 0.46290955, 0.92581911, 1.38872866, 0.78954188, 0.98692735, 1.18431282, + 0.87047794, 0.99483193, 1.11918592}}), + RMSParams(reference_tests::Tensor{Shape{2, 3, 4}, + ov::element::f16, + std::vector({-64.44, -596.5, 280.8, -33.88, 10.48, -226.2, + 587.5, 160., -675.5, 401.5, 929., 0.01672, + 779., -316.8, 134.2, -144.9, -126.5, 553., + 71.2, 907.5, 88.44, -836., -267.2, 701.5})}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-1, + reference_tests::Tensor{ + Shape{2, 3, 4}, + ov::element::f16, + // Expected overflow due to f16 accumulation + // (that's why the conversion to fp32 is needed, tested below) + std::vector{-0., -0., 0., -0., 0., -0., 0., 0., -0., 0., 0., 0., + 0., -0., 0., -0., -0., 0., 0., 0., 0., -0., -0., 0.}}, + reference_tests::Tensor{Shape{1}, ov::element::f16, std::vector{1.0}}), + RMSParams(reference_tests::Tensor{Shape{2, 3, 
4}, + ov::element::f32, + std::vector{-64.44, -596.5, 280.8, -33.88, 10.48, -226.2, + 587.5, 160., -675.5, 401.5, 929., 0.01672, + 779., -316.8, 134.2, -144.9, -126.5, 553., + 71.2, 907.5, 88.44, -836., -267.2, 701.5}}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-1, + reference_tests::Tensor{ + Shape{2, 3, 4}, + ov::element::f16, + std::vector{-0.19433594, -1.79882812, 0.84667969, -0.10217285, 0.03225708, + -0.69628906, 1.80859375, 0.49267578, -1.11035156, 0.66015625, + 1.52734375, 0.00002748, 1.80371094, -0.73339844, 0.31079102, + -0.33544922, -0.23583984, 1.03125000, 0.13269043, 1.69238281, + 0.15698242, -1.48339844, -0.47436523, 1.24511719}}, + reference_tests::Tensor{Shape{1}, ov::element::f32, std::vector{1.0}}), + RMSParams(reference_tests::Tensor{Shape{2, 3, 4}, + ov::element::f32, + std::vector{ + -64.4375000000, -596.5000000000, 280.7500000000, -33.8750000000, + 10.4765625000, -226.2500000000, 587.5000000000, 160.0000000000, + -675.5000000000, 401.5000000000, 929.0000000000, 0.0167236328, + 779.0000000000, -316.7500000000, 134.2500000000, -144.8750000000, + -126.5000000000, 553.0000000000, 71.1875000000, 907.5000000000, + 88.4375000000, -836.0000000000, -267.2500000000, 701.5000000000}}, + reference_tests::Tensor{Shape{1}, ov::element::i64, std::vector({-1})}, + 1e-1, + reference_tests::Tensor{ + Shape{2, 3, 4}, + ov::element::f16, + std::vector{-0.0971679688, -2.6972656250, 0.2116699219, -0.2043457031, 0.0161285400, + -1.0449218750, 0.4521484375, 0.9853515625, -0.5551757812, 0.9897460938, + 0.3818359375, 0.0000549555, 0.9018554688, -1.0996093750, 0.0776977539, + -0.6708984375, -0.1179199219, 1.5468750000, 0.0331726074, 3.3847656250, + 0.0784912109, -2.2246093750, -0.1185913086, 2.4902343750}}, + reference_tests::Tensor{Shape{4}, ov::element::f32, std::vector{0.5, 1.5, 0.25, 2.0}})), + ReferenceRMSLayerTest::getTestCaseName); diff --git a/src/plugins/template/tests/functional/op_reference/roi_align_rotated.cpp b/src/plugins/template/tests/functional/op_reference/roi_align_rotated.cpp index 12ba487829b987..40fecda965ca2b 100644 --- a/src/plugins/template/tests/functional/op_reference/roi_align_rotated.cpp +++ b/src/plugins/template/tests/functional/op_reference/roi_align_rotated.cpp @@ -102,7 +102,7 @@ class ReferenceROIAlignRotatedTest : public testing::TestWithParam(params.roiBatchIdxs.type, params.roiBatchIdxs.shape, params.roiBatchIdxs.data.data()); - const auto roi_align_rot = std::make_shared(featureMap, + const auto roi_align_rot = std::make_shared(featureMap, coords, roisIdx, params.pooledH, diff --git a/src/tests/functional/plugin/conformance/subgraphs_dumper/README.md b/src/tests/functional/plugin/conformance/subgraphs_dumper/README.md index 3f6da9065419b5..b382f14a25c54e 100644 --- a/src/tests/functional/plugin/conformance/subgraphs_dumper/README.md +++ b/src/tests/functional/plugin/conformance/subgraphs_dumper/README.md @@ -1,7 +1,7 @@ # Subgraphs Dumper Tool The tool is designed to analyse any arbitrary scope of the models in a formats supported by OpenVINO frontends -to extract and serialize unique operations and patterns from the input models. The criteria for +to extract and serialize unique operations and patterns from the input models. The criteria for uniqueness and matching are defined by implementation of twon interface classes: * `Matcher` defines the rules for dumping operatons to the cache. * `Extractor` defines the rules for extracting subgraphs from the models. 
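Note on the RMS reference vectors added in rms.cpp above: the expected outputs appear to follow plain RMS normalization over the given reduction axis, y = x / sqrt(mean(x^2) + eps), optionally multiplied by the scale tensor supplied as the last parameter. A minimal NumPy sketch (illustrative only, not part of this PR) that reproduces, for example, the Shape{2, 3} f32 case with eps = 1e-5:

```
# Sketch of the RMS normalization used to derive the expected test vectors above
# (hypothetical helper, not code from the PR).
import numpy as np

def rms_norm(x, axis=-1, eps=1e-5, scale=1.0):
    # y = x / sqrt(mean(x^2) + eps) * scale, reduced over `axis`
    rms = np.sqrt(np.mean(np.square(x), axis=axis, keepdims=True) + eps)
    return x / rms * scale

x = np.array([[-6.44250308, -59.65135475, 28.08134504],
              [-3.38603289, 1.047344, -22.62146978]], dtype=np.float32)
print(rms_norm(x))
# approx. [[-0.16844749, -1.559661,   0.7342227 ],
#          [-0.25613253,  0.07922512, -1.71117484]]
```

With scale = 0.5 the same inputs give the halved expected vectors used in the scaled test cases.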
@@ -16,11 +16,11 @@ uniqueness and matching are defined by implementation of twon interface classes: * `read value & assign` extracts stateful graphs. > NOTE: -> Please check the following architecture [diagram](./../../../../../../docs/sphinx_setup/_static/images/img/subgraphs_dumper_arch_diaram.png) to get detailed information. +> Please check the following architecture [diagram](../../../../../../docs/articles_en/assets/images/subgraphs_dumper_arch_diaram.png) to get detailed information. ## Build -To build the tool, run the following commands: +To build the tool, run the following commands: ``` cmake -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_TESTS=ON . make --jobs=$(nproc --all) ov_subgraphs_dumper @@ -28,7 +28,7 @@ make --jobs=$(nproc --all) ov_subgraphs_dumper The outcome of a build is a `ov_subgraphs_dumper` binary located in the building artifacts folder. ## Run -The tool takes only one required command-line parameter: +The tool takes only one required command-line parameter: * `--input_folders` - Required. Comma separated paths to the input folders with models in Intermediate Representation format (IRs). The separator is `,`. * `--output_folder` - Optinal. Path to the output folders where the IRs will be serialized. Default value is "output". * `--local_cache` - Optional. Comma-separated paths to the local cache folders with IRs. The separator is `,`. @@ -36,7 +36,7 @@ The tool takes only one required command-line parameter: * `--extract_body` - Optional. Allows extracting operation bodies to the operation cache. * `--cache_type` - Optional. Allows extracting Operations, Subgraphs, or both types. The default value is `OP` and `GRAPH`. -Example running command: +Example running command: ```ov_subgraphs_dumper --input_folders /dir_0/to/models,/dir_1/to/models --output_folder /path/to/dir``` ## Extraction Algorithm @@ -55,4 +55,4 @@ make ov_subgraphs_dumper_tests ``` ## Architecture Diagram -![SubgraphsDumper Architecture Diagram](./../../../../../../docs/sphinx_setup/_static/images/img/subgraphs_dumper_arch_diaram.png) +![SubgraphsDumper Architecture Diagram](../../../../../../docs/articles_en/assets/images/subgraphs_dumper_arch_diaram.png) diff --git a/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/single_op_graph.cpp b/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/single_op_graph.cpp index f9a4a1f70e017c..98c3d234914be8 100644 --- a/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/single_op_graph.cpp +++ b/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/single_op_graph.cpp @@ -961,7 +961,7 @@ std::shared_ptr generate(const std::shared_ptr& return std::make_shared(results, params, "ROIAlignGraph"); } -std::shared_ptr generate(const std::shared_ptr& node) { +std::shared_ptr generate(const std::shared_ptr& node) { ov::ParameterVector params{std::make_shared(ov::element::f32, ov::Shape{{1, 1, 16, 16}})}; const auto coords = std::make_shared( ov::element::f32, @@ -969,7 +969,7 @@ std::shared_ptr generate(const std::shared_ptr(node->get_rois_input_second_dim_size(), 0)); const auto roisIdx = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{0}); - auto new_node = std::make_shared(params.at(0), coords, roisIdx, 2, 2, 2, 1, true); + auto new_node = std::make_shared(params.at(0), coords, roisIdx, 2, 2, 2, 1, true); ov::ResultVector results{std::make_shared(new_node)}; return std::make_shared(results, params, 
"ROIAlignRotatedGraph"); } diff --git a/src/tests/functional/shared_test_classes/src/subgraph/weights_decompression_builders.cpp b/src/tests/functional/shared_test_classes/src/subgraph/weights_decompression_builders.cpp index 6f86e420b1a71e..38e45065c43b47 100644 --- a/src/tests/functional/shared_test_classes/src/subgraph/weights_decompression_builders.cpp +++ b/src/tests/functional/shared_test_classes/src/subgraph/weights_decompression_builders.cpp @@ -200,9 +200,11 @@ std::shared_ptr initGatherDecompressionSubgraph(const ov::Shape& data_ original_data_shape[data_idx] = data_shape[1] / group_size; original_data_shape.insert(original_data_shape.begin() + data_idx + 1, group_size); } - ov::test::utils::InputGenerateData generate_data; + + const auto up_to = data_precision == ov::element::i4 ? 7 : 15; + ov::test::utils::InputGenerateData generate_data(0, up_to); if (data_precision.is_signed()) - generate_data.start_from = -5; + generate_data.start_from = -1; auto weights_tensor = ov::test::utils::create_and_fill_tensor(data_precision, original_data_shape, generate_data); auto weights = std::make_shared(weights_tensor); weights->set_friendly_name("Compressed_weights"); @@ -226,7 +228,7 @@ std::shared_ptr initGatherDecompressionSubgraph(const ov::Shape& data_ scaleshift_const_shape.end()); if (add_subtract) { auto shift_tensor_shape = per_tensor_zp ? ov::Shape{1} : scaleshift_const_shape; - auto shift_tensor = ov::test::utils::create_and_fill_tensor(data_precision, shift_tensor_shape); + auto shift_tensor = ov::test::utils::create_and_fill_tensor(data_precision, shift_tensor_shape, ov::test::utils::InputGenerateData(0, up_to)); if (per_tensor_zp && data_precision.bitwidth() == 4) { static_cast(shift_tensor.data())[0] = 0x88; } diff --git a/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py b/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py index 6662bb78ca0cdd..e6291ef566eaf8 100644 --- a/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py +++ b/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py @@ -98,7 +98,7 @@ def numpy_to_torch_recursively(x): ov_inputs = flattenize_inputs(inputs) if self.use_torch_compile_backend(): - self.torch_compile_backend_test(model, torch_inputs, custom_eps) + self.torch_compile_backend_test(model, torch_inputs, custom_eps, **kwargs) else: if self.use_torch_export(): from openvino import convert_model @@ -262,7 +262,7 @@ def _resolve_input_shape_dtype(self, om, ov_inputs, dynamic_shapes): om.validate_nodes_and_infer_types() return om - def torch_compile_backend_test(self, model, inputs, custom_eps): + def torch_compile_backend_test(self, model, inputs, custom_eps, **kwargs): torch._dynamo.reset() with torch.no_grad(): model.eval() @@ -271,8 +271,15 @@ def torch_compile_backend_test(self, model, inputs, custom_eps): torch._dynamo.reset() with torch.no_grad(): model.eval() + options={"testing": 1,} + if ("aot_autograd" in kwargs): + options.update({"aot_autograd": True,}) + dynamic = False + if ("dynamic" in kwargs): + dynamic = kwargs["dynamic"] + ov_model = torch.compile( - model, backend="openvino", options={"testing": 1}) + model, backend="openvino", dynamic=dynamic, options=options) ov_res = ov_model(*inputs) if not isinstance(fw_res, (tuple)): diff --git a/tests/layer_tests/pytorch_tests/test_expand.py b/tests/layer_tests/pytorch_tests/test_expand.py index 659fa70d17a5f7..4ef275ae0d4bfd 100644 --- a/tests/layer_tests/pytorch_tests/test_expand.py +++ b/tests/layer_tests/pytorch_tests/test_expand.py @@ -2,6 +2,7 @@ # 
SPDX-License-Identifier: Apache-2.0 import pytest +import random from pytorch_layer_test_class import PytorchLayerTest @@ -36,6 +37,7 @@ def forward_broadcast(self, x): @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export + @pytest.mark.precommit_fx_backend def test_expand(self, dims, op_type, ie_device, precision, ir_version): self._test(*self.create_model(dims, op_type), ie_device, precision, ir_version) @@ -70,6 +72,7 @@ def forward_broadcast(self, x, y): @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export + @pytest.mark.precommit_fx_backend def test_expand(self, dims, op_type, ie_device, precision, ir_version): self._test(*self.create_model(op_type), ie_device, precision, ir_version, kwargs_to_prepare_input={"broadcast_shape": dims}) @@ -110,3 +113,29 @@ def forward(self, x, y): def test_expand(self, ie_device, precision, ir_version, kwargs_to_prepare_input): self._test(*self.create_model(), ie_device, precision, ir_version, kwargs_to_prepare_input=kwargs_to_prepare_input) + +class TestDynamicExpand(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + last_dym = random.randint(1,4) + return (np.random.randn(1, 3, 1).astype(np.float32), last_dym) + + def create_model(self, dim): + import torch + + class aten_expand(torch.nn.Module): + def __init__(self, dims): + super(aten_expand, self).__init__() + self.dims = dims + + def forward(self, x, dym): + return x.expand((self.dims+(dym,))) + + ref_net = None + + return aten_expand(dim), ref_net, f"aten::expand" + + @pytest.mark.parametrize("dims", [(4, 3), (-1, -1)]) + @pytest.mark.precommit_fx_backend + def test_dynamic_expand(self, dims, ie_device, precision, ir_version): + self._test(*self.create_model(dims), ie_device, precision, ir_version, dynamic=True, aot_autograd=True) diff --git a/tests/layer_tests/pytorch_tests/test_reshape.py b/tests/layer_tests/pytorch_tests/test_reshape.py index 8cddf05aab0211..0498d410600b27 100644 --- a/tests/layer_tests/pytorch_tests/test_reshape.py +++ b/tests/layer_tests/pytorch_tests/test_reshape.py @@ -3,13 +3,14 @@ import numpy as np import pytest +import random from pytorch_layer_test_class import PytorchLayerTest class TestReshape(PytorchLayerTest): def _prepare_input(self): - return (np.random.uniform(0, 50, (1, 12, 12, 24)).astype(np.float32),) + return (np.random.uniform(0, 50, (1, 12, 12, 24)).astype(np.float32)) def create_model(self, shape): import torch @@ -39,5 +40,37 @@ def forward(self, x): @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export + @pytest.mark.precommit_fx_backend def test_reshape(self, shape, ie_device, precision, ir_version): self._test(*self.create_model(shape), ie_device, precision, ir_version) + +class TestDynamicReshape(PytorchLayerTest): + def _prepare_input(self): + last_dym = random.randint(1,2) + return (np.random.uniform(0, 50, (1, 12, 12, 24)).astype(np.float32), last_dym) + + def create_model(self, shape): + import torch + + class aten_reshape(torch.nn.Module): + def __init__(self, shape): + super(aten_reshape, self).__init__() + self.shape = shape + + def forward(self, x, dym): + #return torch.reshape(x, self.shape) + dym2 = int(torch.ops.aten.sym_size(x, 3)/dym) + return torch.reshape(x, [12, 12, dym2, dym]) + + ref_net = None + + return aten_reshape(shape), ref_net, "aten::reshape" + + @pytest.mark.parametrize(("shape"), [ + [12, 12, 24, 1], + [12, 12, 12, 2], + [24, 12, 12, 1], + ]) + @pytest.mark.precommit_fx_backend + def test_dynamic_reshape(self, shape, 
ie_device, precision, ir_version): + self._test(*self.create_model(shape), ie_device, precision, ir_version, aot_autograd=True, dynamic=True)
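For the new precommit_fx_backend tests above, torch_compile_backend_test now forwards the extra kwargs to torch.compile. A condensed sketch of that call path (the helper name and the model/input arguments are placeholders, not code from the PR):

```
# How the new `dynamic`/`aot_autograd` kwargs reach torch.compile
# (illustrative sketch under the assumption that openvino.torch is installed).
import torch
import openvino.torch  # noqa: F401  registers the "openvino" backend for torch.compile

def compile_with_openvino(model, inputs, dynamic=False, aot_autograd=False):
    # Mirrors the options handling added to torch_compile_backend_test:
    # "testing": 1 is always set; "aot_autograd" is only added when requested.
    options = {"testing": 1}
    if aot_autograd:
        options["aot_autograd"] = True
    compiled = torch.compile(model, backend="openvino", dynamic=dynamic, options=options)
    with torch.no_grad():
        return compiled(*inputs)
```

test_dynamic_expand and test_dynamic_reshape pass dynamic=True and aot_autograd=True, so the randomly generated dimension is treated as dynamic rather than baked into the traced graph.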