diff --git a/.github/dockerfiles/docker_tag b/.github/dockerfiles/docker_tag index 8e5386a30ec997..56faa37d1da67f 100644 --- a/.github/dockerfiles/docker_tag +++ b/.github/dockerfiles/docker_tag @@ -1 +1 @@ -pr-26656 \ No newline at end of file +pr-26993 \ No newline at end of file diff --git a/.github/dockerfiles/ov_build/ubuntu_20_04_arm64/Dockerfile b/.github/dockerfiles/ov_build/ubuntu_20_04_arm64/Dockerfile index c7d0e95164f414..7653fe6abb7434 100644 --- a/.github/dockerfiles/ov_build/ubuntu_20_04_arm64/Dockerfile +++ b/.github/dockerfiles/ov_build/ubuntu_20_04_arm64/Dockerfile @@ -35,7 +35,7 @@ RUN apt-get update && \ libhdf5-dev \ # For Java API default-jdk \ - # Compiler + # Compiler, required for multi-isa build gcc-10 \ g++-10 \ && \ diff --git a/.github/dockerfiles/ov_build/ubuntu_20_04_x64/Dockerfile b/.github/dockerfiles/ov_build/ubuntu_20_04_x64/Dockerfile index 53829ad50b2975..1620e674ef67d5 100644 --- a/.github/dockerfiles/ov_build/ubuntu_20_04_x64/Dockerfile +++ b/.github/dockerfiles/ov_build/ubuntu_20_04_x64/Dockerfile @@ -30,9 +30,6 @@ RUN apt-get update && \ python3.9-distutils \ # For Java API default-jdk \ - # Compiler \ - gcc-10 \ - g++-10 \ && \ rm -rf /var/lib/apt/lists/* @@ -42,10 +39,6 @@ RUN chmod +x /install_build_dependencies.sh && \ /install_build_dependencies.sh && \ rm -rf /var/lib/apt/lists/* -# Set gcc-10 as a default compiler -RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 30 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 30 - # Install sscache ARG SCCACHE_VERSION="v0.7.5" ENV SCCACHE_HOME="/opt/sccache" \ diff --git a/.github/dockerfiles/ov_build/ubuntu_20_04_x64_nvidia/Dockerfile b/.github/dockerfiles/ov_build/ubuntu_20_04_x64_nvidia/Dockerfile index 5df369bbb6398a..0a4d7ef90aa115 100644 --- a/.github/dockerfiles/ov_build/ubuntu_20_04_x64_nvidia/Dockerfile +++ b/.github/dockerfiles/ov_build/ubuntu_20_04_x64_nvidia/Dockerfile @@ -35,9 +35,6 @@ RUN apt-get update && \ python3.11-distutils \ # For Java API default-jdk \ - # Compiler \ - gcc-10 \ - g++-10 \ && \ rm -rf /var/lib/apt/lists/* @@ -47,10 +44,6 @@ RUN chmod +x /install_build_dependencies.sh && \ /install_build_dependencies.sh && \ rm -rf /var/lib/apt/lists/* -# Set gcc-10 as a default compiler -RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 30 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 30 - # Install sscache ARG SCCACHE_VERSION="v0.7.5" ENV SCCACHE_HOME="/opt/sccache" \ diff --git a/.github/github_org_control/configs.py b/.github/github_org_control/configs.py index 872638bb657fdf..3df12803c77de0 100644 --- a/.github/github_org_control/configs.py +++ b/.github/github_org_control/configs.py @@ -14,8 +14,8 @@ from pathlib import Path -if sys.version_info[:2] < (3, 8): - raise Exception("Python version must be >= 3.8") +if sys.version_info[:2] < (3, 9): + raise Exception("Python version must be >= 3.9") class ConfigException(Exception): diff --git a/.github/labeler.yml b/.github/labeler.yml index 5421d669ed224f..daa5375b175bd3 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -76,6 +76,8 @@ 'category: NPUW': - 'src/plugins/intel_npu/src/plugin/npuw/**/*' +- 'src/plugins/intel_npu/tests/functional/behavior/npuw/**/*' +- 'src/plugins/intel_npu/tests/unit/behavior/npuw/**/*' 'category: HETERO': - 'src/plugins/hetero/**/*' diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index 604ca0fdb81b29..6a163fb5e50043 100644 --- a/.github/workflows/coverity.yml +++ 
b/.github/workflows/coverity.yml @@ -112,7 +112,7 @@ jobs: - name: Pack Artefacts run: | pushd ${BUILD_DIR} - tar -C ${BUILD_DIR} -I pigz -cvf openvino.tgz cov-int + tar -cvf - cov-int | pigz > openvino.tgz popd - name: Submit artefacts diff --git a/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml b/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml index c2da4c1b2d2f9c..83770900559bab 100644 --- a/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml +++ b/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml @@ -158,11 +158,11 @@ jobs: run: | pushd ${INSTALL_DIR} - tar -I pigz -cvf ${BUILD_DIR}/openvino_package.tar.gz * + tar -cvf - * | pigz > ${BUILD_DIR}/openvino_package.tar.gz popd pushd ${INSTALL_TEST_DIR} - tar -I pigz -cvf ${BUILD_DIR}/openvino_tests.tar.gz * + tar -cvf - * | pigz > ${BUILD_DIR}/openvino_tests.tar.gz popd # @@ -230,11 +230,11 @@ jobs: - name: Extract OpenVINO packages run: | pushd $INSTALL_DIR - tar -I pigz -xf openvino_package.tar.gz -C $INSTALL_DIR + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd pushd $INSTALL_TEST_DIR - tar -I pigz -xf openvino_tests.tar.gz -C $INSTALL_DIR + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - name: Snippets func tests @@ -287,11 +287,11 @@ jobs: - name: Extract OpenVINO packages run: | pushd $INSTALL_DIR - tar -I pigz -xf openvino_package.tar.gz -C $INSTALL_DIR + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd pushd $INSTALL_TEST_DIR - tar -I pigz -xf openvino_tests.tar.gz -C $INSTALL_DIR + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - name: Fetch setup_python action diff --git a/.github/workflows/job_build_linux.yml b/.github/workflows/job_build_linux.yml index d58e879c736610..b8eea4375e7e58 100644 --- a/.github/workflows/job_build_linux.yml +++ b/.github/workflows/job_build_linux.yml @@ -182,15 +182,15 @@ jobs: working-directory: ${{ env.BUILD_DIR }} - name: Pack openvino_package - run: tar -I pigz -cvf ${BUILD_DIR}/openvino_package.tar.gz * + run: tar -cvf - * | pigz > ${BUILD_DIR}/openvino_package.tar.gz working-directory: ${{ env.INSTALL_DIR }} - name: Pack openvino_developer_package - run: tar -I pigz -cvf ${BUILD_DIR}/openvino_developer_package.tar.gz * + run: tar -cvf - * | pigz > ${BUILD_DIR}/openvino_developer_package.tar.gz working-directory: ${{ env.DEVELOPER_PACKAGE_DIR }} - name: Pack openvino_tests - run: tar -I pigz -cvf ${BUILD_DIR}/openvino_tests.tar.gz * + run: tar -cvf - * | pigz > ${BUILD_DIR}/openvino_tests.tar.gz working-directory: ${{ env.INSTALL_TEST_DIR }} - name: Build Debian packages diff --git a/.github/workflows/job_cpu_functional_tests.yml b/.github/workflows/job_cpu_functional_tests.yml index 77376d442939a0..24c8542ae80140 100644 --- a/.github/workflows/job_cpu_functional_tests.yml +++ b/.github/workflows/job_cpu_functional_tests.yml @@ -55,15 +55,19 @@ jobs: echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "PARALLEL_TEST_SCRIPT=$GITHUB_WORKSPACE/install/tests/functional_test_utils/layer_tests_summary/run_parallel.py" >> "$GITHUB_ENV" echo "PARALLEL_TEST_CACHE=$GITHUB_WORKSPACE/install/tests/test_cache.lst" >> "$GITHUB_ENV" - + + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install pigz + - name: Extract OpenVINO packages run: | pushd $INSTALL_DIR - tar -I pigz -xf openvino_package.tar.gz -C $INSTALL_DIR + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd pushd $INSTALL_TEST_DIR - tar -I pigz -xf openvino_tests.tar.gz -C 
$INSTALL_DIR + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - name: Fetch setup_python action diff --git a/.github/workflows/job_cxx_unit_tests.yml b/.github/workflows/job_cxx_unit_tests.yml index b83e83af4ed68c..99c363d04d23a7 100644 --- a/.github/workflows/job_cxx_unit_tests.yml +++ b/.github/workflows/job_cxx_unit_tests.yml @@ -60,6 +60,10 @@ jobs: echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "SETUPVARS_COMMAND=${{ env.SOURCE_COMMAND }} $GITHUB_WORKSPACE/install/${{ env.SETUPVARS }}" >> "$GITHUB_ENV" + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install pigz + - name: Setup Variables (Windows) if: ${{ runner.os == 'Windows' }} run: Add-Content -Path $env:GITHUB_ENV -Value "SETUPVARS_COMMAND=${{ env.SOURCE_COMMAND }} ${{ github.workspace }}/install/${{ env.SETUPVARS }}" @@ -68,10 +72,10 @@ jobs: if: ${{ runner.os != 'Windows' }} run: | pushd $INSTALL_DIR - tar -I pigz -xf openvino_package.tar.gz -C $INSTALL_DIR + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd pushd $INSTALL_TEST_DIR - tar -I pigz -xf openvino_tests.tar.gz -C $INSTALL_DIR + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - name: Extract OpenVINO packages (Windows) diff --git a/.github/workflows/job_gpu_tests.yml b/.github/workflows/job_gpu_tests.yml index 147afcccddfe17..324e653c57ebab 100644 --- a/.github/workflows/job_gpu_tests.yml +++ b/.github/workflows/job_gpu_tests.yml @@ -59,10 +59,10 @@ jobs: - name: Extract OpenVINO packages run: | pushd $INSTALL_DIR - tar -I pigz -xf openvino_package.tar.gz -C $INSTALL_DIR + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd pushd $INSTALL_TEST_DIR - tar -I pigz -xf openvino_tests.tar.gz -C $INSTALL_DIR + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - name: Install dependencies (Linux) diff --git a/.github/workflows/job_jax_models_tests.yml b/.github/workflows/job_jax_models_tests.yml index 2fed97a78e9c07..9956a27f234b36 100644 --- a/.github/workflows/job_jax_models_tests.yml +++ b/.github/workflows/job_jax_models_tests.yml @@ -60,7 +60,7 @@ jobs: - name: Extract OpenVINO packages and tests run: | - tar -I pigz -xf openvino_tests.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - name: Fetch setup_python action diff --git a/.github/workflows/job_onnx_models_tests.yml b/.github/workflows/job_onnx_models_tests.yml index 0eda00f7afb937..321aa88d614310 100644 --- a/.github/workflows/job_onnx_models_tests.yml +++ b/.github/workflows/job_onnx_models_tests.yml @@ -64,7 +64,7 @@ jobs: - name: Extract OpenVINO packages and tests run: | - tar -I pigz -xf openvino_tests.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} # Issue 148922 diff --git a/.github/workflows/job_onnx_runtime.yml b/.github/workflows/job_onnx_runtime.yml index 61b13939fc60b7..0ceb080d82184d 100644 --- a/.github/workflows/job_onnx_runtime.yml +++ b/.github/workflows/job_onnx_runtime.yml @@ -59,7 +59,7 @@ jobs: - name: Extract OpenVINO package run: | pushd ${INSTALL_DIR} - tar -I pigz -xf openvino_package.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - name: Fetch ONNX runtime version and skip tests list diff --git a/.github/workflows/job_python_unit_tests.yml b/.github/workflows/job_python_unit_tests.yml index 8db2ebf86dca91..d63262c665d45c 100644 
--- a/.github/workflows/job_python_unit_tests.yml +++ b/.github/workflows/job_python_unit_tests.yml @@ -66,9 +66,13 @@ jobs: echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "LAYER_TESTS_INSTALL_DIR=$GITHUB_WORKSPACE/install/tests/layer_tests" >> "$GITHUB_ENV" - - name: Extract OpenVINO artifacts + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install pigz + + - name: Extract OpenVINO packages run: | - tar -I pigz -xf openvino_tests.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - name: Fetch setup_python action diff --git a/.github/workflows/job_pytorch_layer_tests.yml b/.github/workflows/job_pytorch_layer_tests.yml index c4f0d1efb37c75..95074dc84f1ff9 100644 --- a/.github/workflows/job_pytorch_layer_tests.yml +++ b/.github/workflows/job_pytorch_layer_tests.yml @@ -65,11 +65,15 @@ jobs: echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "LAYER_TESTS_INSTALL_DIR=$GITHUB_WORKSPACE/install/tests/layer_tests" >> "$GITHUB_ENV" - - - name: Extract OpenVINO artifacts (Linux, macOS) + + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install pigz + + - name: Extract OpenVINO packages (Linux, macOS) if: runner.os != 'Windows' run: | - tar -I pigz -xf openvino_tests.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - name: Extract OpenVINO artifacts (Windows) diff --git a/.github/workflows/job_pytorch_models_tests.yml b/.github/workflows/job_pytorch_models_tests.yml index ce40dd7f0618ce..a77c1318f3a0c8 100644 --- a/.github/workflows/job_pytorch_models_tests.yml +++ b/.github/workflows/job_pytorch_models_tests.yml @@ -73,7 +73,7 @@ jobs: - name: Extract OpenVINO artifacts run: | - tar -I pigz -xf openvino_tests.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - name: Fetch setup_python action diff --git a/.github/workflows/job_samples_tests.yml b/.github/workflows/job_samples_tests.yml index 12c63644d7b586..7cde4e6fd18eae 100644 --- a/.github/workflows/job_samples_tests.yml +++ b/.github/workflows/job_samples_tests.yml @@ -54,17 +54,17 @@ jobs: echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "BUILD_DIR=$GITHUB_WORKSPACE/build" >> "$GITHUB_ENV" + + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install coreutils pigz - - name: Extract OpenVINO packages, wheels and tests + - name: Extract OpenVINO packages and tests run: | - tar -I pigz -xf openvino_package.tar.gz -C ${INSTALL_DIR} - tar -I pigz -xf openvino_tests.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - - name: Install OpenVINO dependencies (mac) - if: runner.os == 'macOS' - run: brew install coreutils - - name: Fetch setup_python action # Python is already installed on Ubuntu within Dockerfile if: runner.os != 'Linux' diff --git a/.github/workflows/job_tensorflow_layer_tests.yml b/.github/workflows/job_tensorflow_layer_tests.yml index 3ad19d3301945f..ae6e91a00d1497 100644 --- a/.github/workflows/job_tensorflow_layer_tests.yml +++ 
b/.github/workflows/job_tensorflow_layer_tests.yml @@ -66,10 +66,14 @@ jobs: echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "LAYER_TESTS_INSTALL_DIR=$GITHUB_WORKSPACE/install/tests/layer_tests" >> "$GITHUB_ENV" - - name: Extract OpenVINO artifacts (Linux and macOS) + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install pigz + + - name: Extract OpenVINO packages (Linux, macOS) if: runner.os != 'Windows' run: | - tar -I pigz -xf openvino_tests.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - name: Extract OpenVINO artifacts (Windows) diff --git a/.github/workflows/job_tensorflow_models_tests.yml b/.github/workflows/job_tensorflow_models_tests.yml index 76ee01cc76c3ef..db34ec7b793551 100644 --- a/.github/workflows/job_tensorflow_models_tests.yml +++ b/.github/workflows/job_tensorflow_models_tests.yml @@ -65,7 +65,7 @@ jobs: - name: Extract OpenVINO artifacts (Linux and macOS) run: | - tar -I pigz -xf openvino_tests.tar.gz -C . + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - name: Fetch setup_python action diff --git a/.github/workflows/job_tokenizers.yml b/.github/workflows/job_tokenizers.yml index 089b104d7af1d1..238dbfec3a34eb 100644 --- a/.github/workflows/job_tokenizers.yml +++ b/.github/workflows/job_tokenizers.yml @@ -58,6 +58,10 @@ jobs: .github/actions/setup_python .github/actions/cache install_build_dependencies.sh + + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install pigz - name: Setup Python ${{ env.PYTHON_VERSION }} uses: ./.github/actions/setup_python diff --git a/.github/workflows/linux_conditional_compilation.yml b/.github/workflows/linux_conditional_compilation.yml index 6f9b761ce3352c..7b5467b01ad73e 100644 --- a/.github/workflows/linux_conditional_compilation.yml +++ b/.github/workflows/linux_conditional_compilation.yml @@ -200,23 +200,23 @@ jobs: - name: Pack Artifacts run: | pushd ${SELECTIVE_BUILD_STAT_DIR} - tar -I pigz -cvf ${BUILD_DIR}/openvino_selective_build_stat.tar.gz * + tar -cvf - * | pigz > ${BUILD_DIR}/openvino_selective_build_stat.tar.gz popd pushd ${INSTALL_DIR} - tar -I pigz -cvf ${BUILD_DIR}/openvino_package.tar.gz \ - install_dependencies/install_openvino_dependencies.sh + tar -cvf - install_dependencies/install_openvino_dependencies.sh | pigz > ${BUILD_DIR}/openvino_package.tar.gz popd cp -v ${OPENVINO_REPO}/temp/tbb/lib/lib* ${INSTALL_TEST_DIR}/tests pushd ${INSTALL_TEST_DIR} - tar -I pigz -cvf ${BUILD_DIR}/openvino_tests.tar.gz \ + tar -cvf - \ tests/ov_cpu_func_tests \ tests/libopenvino_template_extension.so \ tests/libze_loader.so* \ tests/libhwloc* \ tests/libtbb* \ - tests/functional_test_utils/layer_tests_summary/* + tests/functional_test_utils/layer_tests_summary/* \ + | pigz > ${BUILD_DIR}/openvino_tests.tar.gz popd # @@ -302,7 +302,8 @@ jobs: path: ${{ env.SELECTIVE_BUILD_STAT_DIR }} - name: Extract selective build statistics package - run: tar -I pigz -xvf ${SELECTIVE_BUILD_STAT_DIR}/openvino_selective_build_stat.tar.gz -C ${SELECTIVE_BUILD_STAT_DIR} + run: | + pigz -dc ${SELECTIVE_BUILD_STAT_DIR}/openvino_selective_build_stat.tar.gz | tar -xf - -C ${SELECTIVE_BUILD_STAT_DIR} # # Build diff --git a/.github/workflows/linux_sanitizers.yml b/.github/workflows/linux_sanitizers.yml index cec499e7971130..b23e67a0f2b30e 100644 --- a/.github/workflows/linux_sanitizers.yml +++ 
b/.github/workflows/linux_sanitizers.yml @@ -175,11 +175,11 @@ jobs: - name: Pack Artifacts run: | pushd ${INSTALL_DIR} - tar -I pigz -cvf ${BUILD_DIR}/openvino_package.tar.gz * + tar -cvf - * | pigz > ${BUILD_DIR}/openvino_package.tar.gz popd pushd ${INSTALL_TEST_DIR} - tar -I pigz -cvf ${BUILD_DIR}/openvino_tests.tar.gz * + tar -cvf - * | pigz > ${BUILD_DIR}/openvino_tests.tar.gz popd # @@ -257,10 +257,10 @@ jobs: - name: Extract OpenVINO packages run: | pushd $INSTALL_DIR - tar -I pigz -xf openvino_package.tar.gz -C $INSTALL_DIR + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd pushd $INSTALL_TEST_DIR - tar -I pigz -xf openvino_tests.tar.gz -C $INSTALL_DIR + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - name: Install dependencies (Linux) diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index b91bd65465621a..6e3f344c6dd944 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -77,6 +77,7 @@ jobs: INSTALL_DIR: ${{ github.workspace }}/openvino_install INSTALL_DIR_JS: ${{ github.workspace }}/openvino_install/js INSTALL_TEST_DIR: ${{ github.workspace }}/tests_install + INSTALL_WHEELS_DIR: ${{ github.workspace }}/install/wheels BUILD_DIR: ${{ github.workspace }}/build if: "!needs.smart_ci.outputs.skip_workflow" steps: @@ -104,7 +105,7 @@ jobs: # - name: Install build dependencies - run: brew install coreutils ninja scons + run: brew install coreutils ninja scons pigz - name: Setup Python ${{ env.PYTHON_VERSION }} uses: ./openvino/.github/actions/setup_python @@ -167,16 +168,15 @@ jobs: run: | cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR }} -P ${{ env.BUILD_DIR }}/cmake_install.cmake cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_TEST_DIR }} -DCOMPONENT=tests -P ${{ env.BUILD_DIR }}/cmake_install.cmake - cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR }} -DCOMPONENT=python_wheels -P ${{ env.BUILD_DIR }}/cmake_install.cmake + cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_WHEELS_DIR }} -DCOMPONENT=python_wheels -P ${{ env.BUILD_DIR }}/cmake_install.cmake - name: Pack Artifacts run: | pushd ${{ env.INSTALL_DIR }} - tar -I pigz -cvf ${{ env.BUILD_DIR }}/openvino_package.tar.gz * + tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_package.tar.gz popd - pushd ${{ env.INSTALL_TEST_DIR }} - tar -I pigz -cvf ${{ env.BUILD_DIR }}/openvino_tests.tar.gz * + tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_tests.tar.gz popd - name: Cmake & Build - OpenVINO Contrib @@ -210,6 +210,13 @@ jobs: name: openvino_package path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz if-no-files-found: 'error' + + - name: Upload openvino wheels + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + with: + name: openvino_wheels + path: ${{ env.INSTALL_WHEELS_DIR }}/wheels/*.whl + if-no-files-found: 'error' - name: Upload openvino tests package if: ${{ always() }} diff --git a/.github/workflows/mac_arm64.yml b/.github/workflows/mac_arm64.yml index 8d4843627e7b9f..16658318de20d8 100644 --- a/.github/workflows/mac_arm64.yml +++ b/.github/workflows/mac_arm64.yml @@ -77,6 +77,7 @@ jobs: INSTALL_DIR: ${{ github.workspace }}/openvino_install INSTALL_DIR_JS: ${{ github.workspace }}/openvino_install/js INSTALL_TEST_DIR: ${{ github.workspace }}/tests_install + INSTALL_WHEELS_DIR: ${{ github.workspace }}/install/wheels BUILD_DIR: ${{ github.workspace }}/build if: "!needs.smart_ci.outputs.skip_workflow" steps: @@ -104,7 +105,7 @@ jobs: # - name: Install build dependencies - run: brew install coreutils ninja scons + run: brew 
install coreutils ninja scons pigz - name: Setup Python ${{ env.PYTHON_VERSION }} uses: ./openvino/.github/actions/setup_python @@ -167,16 +168,16 @@ jobs: run: | cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR }} -P ${{ env.BUILD_DIR }}/cmake_install.cmake cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_TEST_DIR }} -DCOMPONENT=tests -P ${{ env.BUILD_DIR }}/cmake_install.cmake - cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR }} -DCOMPONENT=python_wheels -P ${{ env.BUILD_DIR }}/cmake_install.cmake + cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_WHEELS_DIR }} -DCOMPONENT=python_wheels -P ${{ env.BUILD_DIR }}/cmake_install.cmake - name: Pack Artifacts run: | pushd ${{ env.INSTALL_DIR }} - tar -I pigz -cvf ${{ env.BUILD_DIR }}/openvino_package.tar.gz * + tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_package.tar.gz popd pushd ${{ env.INSTALL_TEST_DIR }} - tar -I pigz -cvf ${{ env.BUILD_DIR }}/openvino_tests.tar.gz * + tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_tests.tar.gz popd - name: Cmake & Build - OpenVINO Contrib @@ -210,6 +211,13 @@ jobs: name: openvino_package path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz if-no-files-found: 'error' + + - name: Upload openvino wheels + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + with: + name: openvino_wheels + path: ${{ env.INSTALL_WHEELS_DIR }}/wheels/*.whl + if-no-files-found: 'error' - name: Upload openvino tests package if: ${{ always() }} diff --git a/.github/workflows/ubuntu_22.yml b/.github/workflows/ubuntu_22.yml index 19f63471523726..2ebca2b059fdd2 100644 --- a/.github/workflows/ubuntu_22.yml +++ b/.github/workflows/ubuntu_22.yml @@ -176,10 +176,10 @@ jobs: - name: Extract OpenVINO packages run: | pushd ${INSTALL_DIR} - tar -I pigz -xf openvino_package.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd pushd ${INSTALL_TEST_DIR} - tar -I pigz -xf openvino_tests.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - name: Fetch setup_python action @@ -222,7 +222,7 @@ jobs: if: ${{ always() }} run: | pushd ${CONFORMANCE_ARTIFACTS_DIR} - tar -I pigz -cvf ${CONFORMANCE_ARTIFACTS_DIR}/conformance_artifacts.tar.gz * + tar -cvf - * | pigz > ${CONFORMANCE_ARTIFACTS_DIR}/conformance_artifacts.tar.gz popd - name: Upload Conformance Artifacts @@ -248,7 +248,7 @@ jobs: if: ${{ matrix.TEST_TYPE == 'API' }} run: | pushd ${CONFORMANCE_ARTIFACTS_DIR} - tar -I pigz -cvf ${CONFORMANCE_ARTIFACTS_DIR}/conformance_artifacts.tar.gz * + tar -cvf - * | pigz > ${CONFORMANCE_ARTIFACTS_DIR}/conformance_artifacts.tar.gz popd - name: Upload Conformance Artifacts @@ -451,11 +451,11 @@ jobs: - name: Extract OpenVINO packages run: | pushd ${INSTALL_DIR} - tar -I pigz -xf openvino_package.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd pushd ${INSTALL_DIR} - tar -I pigz -xf openvino_developer_package.tar.gz -C ${INSTALL_DIR} + pigz -dc openvino_developer_package.tar.gz | tar -xf - -C ${INSTALL_DIR} popd - name: Clone OpenVINO Contrib diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7169ebc2ba2c9b..c30ce12665ab33 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -115,6 +115,7 @@ Choose the component your Good First Issue is related to. 
You can run tests to m - [C API](https://github.com/openvinotoolkit/openvino/tree/master/src/bindings/c) - [Core](https://github.com/openvinotoolkit/openvino/tree/master/src/core) - [Python API](https://github.com/openvinotoolkit/openvino/tree/master/src/bindings/python) +- [Node.js API](https://github.com/openvinotoolkit/openvino/tree/master/src/bindings/js/node) ##### Frontends - [IR Frontend](https://github.com/openvinotoolkit/openvino/tree/master/src/frontends/ir) diff --git a/cmake/developer_package/ncc_naming_style/ncc_naming_style.cmake b/cmake/developer_package/ncc_naming_style/ncc_naming_style.cmake index d20582b03cb9fc..67a58d56e901e2 100644 --- a/cmake/developer_package/ncc_naming_style/ncc_naming_style.cmake +++ b/cmake/developer_package/ncc_naming_style/ncc_naming_style.cmake @@ -80,11 +80,11 @@ if(ENABLE_NCC_STYLE) set(CMAKE_FIND_LIBRARY_PREFIXES ${_old_CMAKE_FIND_LIBRARY_PREFIXES}) set(CMAKE_FIND_LIBRARY_SUFFIXES ${_old_CMAKE_FIND_LIBRARY_SUFFIXES}) else() - find_host_package(Clang QUIET) - endif() - - if(Clang_FOUND AND TARGET libclang) - get_target_property(libclang_location libclang LOCATION) + find_host_library(libclang_location + NAMES clang libclang libclang-${clang_version} libclang-${clang_version}.so libclang-${clang_version}.so.1 + PATHS /usr/lib /usr/local/lib /usr/lib/llvm-${clang_version}/lib /usr/lib/x86_64-linux-gnu + NO_DEFAULT_PATH + NO_CMAKE_FIND_ROOT_PATH) endif() if(NOT libclang_location) diff --git a/cmake/developer_package/ncc_naming_style/requirements_dev.txt b/cmake/developer_package/ncc_naming_style/requirements_dev.txt index a304b713cb3a2c..724ea2bf15721d 100644 --- a/cmake/developer_package/ncc_naming_style/requirements_dev.txt +++ b/cmake/developer_package/ncc_naming_style/requirements_dev.txt @@ -1,4 +1,3 @@ -clang==12.0.1; python_version == '3.8' clang==12.0.1; python_version == '3.9' clang==14.0; python_version == '3.10' clang==14.0; python_version == '3.11' diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index eedfe078cbd552..2dfb6bb8d04e81 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -8,7 +8,6 @@ set(ENABLE_CPP_API OFF CACHE BOOL "Build with C/C++ API.") set(ENABLE_PYTHON_API OFF CACHE BOOL "Build with Python API.") set(ENABLE_GENAI_API OFF CACHE BOOL "Build with GenAI API.") set(ENABLE_NOTEBOOKS OFF CACHE BOOL "Build with openvino notebooks.") -set(ENABLE_OMZ OFF CACHE BOOL "Build with open_model_zoo.") set(ENABLE_OVMS OFF CACHE BOOL "Build with ovms.") set(OVMS_DOCS_DIR "" CACHE PATH "Path to model server documentation dir.") @@ -90,17 +89,6 @@ function(build_docs) list(APPEND commands COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "FINISHED preprocessing OVMS") endif() - if(${ENABLE_OMZ}) - list(APPEND commands COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "STARTED preprocessing OMZ") - list(APPEND commands - COMMAND ${Python3_EXECUTABLE} ${OpenVINO_SOURCE_DIR}/thirdparty/open_model_zoo/ci/prepare-documentation.py ${CMAKE_BINARY_DIR}/open_model_zoo) - list(APPEND commands COMMAND ${Python3_EXECUTABLE} ${FILE_HELPER_SCRIPT} - --filetype=md - --input_dir=${CMAKE_BINARY_DIR}/open_model_zoo - --output_dir=${SPHINX_SOURCE_DIR} - --exclude_dir=${SPHINX_SOURCE_DIR}) - list(APPEND commands COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "FINISHED preprocessing OMZ") - endif() # Preprocess docs add_custom_target(preprocess_docs diff --git a/docs/articles_en/about-openvino/performance-benchmarks.rst b/docs/articles_en/about-openvino/performance-benchmarks.rst index aa60c44a2ad5c8..40b94210f6c43d 100644 --- 
a/docs/articles_en/about-openvino/performance-benchmarks.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks.rst @@ -18,7 +18,7 @@ Performance Benchmarks This page presents benchmark results for `Intel® Distribution of OpenVINO™ toolkit `__ -and :doc:`OpenVINO Model Server <../ovms_what_is_openvino_model_server>`, for a representative +and :doc:`OpenVINO Model Server <../openvino-workflow/model-server/ovms_what_is_openvino_model_server>`, for a representative selection of public neural networks and Intel® devices. The results may help you decide which hardware to use in your applications or plan AI workload for the hardware you have already implemented in your solutions. Click the buttons below to see the chosen benchmark data. diff --git a/docs/articles_en/documentation/openvino-ecosystem/openvino-security-add-on.rst b/docs/articles_en/documentation/openvino-ecosystem/openvino-security-add-on.rst index ea76392be4e2e6..2d5598a5eb8e9d 100644 --- a/docs/articles_en/documentation/openvino-ecosystem/openvino-security-add-on.rst +++ b/docs/articles_en/documentation/openvino-ecosystem/openvino-security-add-on.rst @@ -17,7 +17,7 @@ In this release, one person performs the role of both the Model Developer and th Overview ######## -The OpenVINO™ Security Add-on works with the :doc:`OpenVINO™ Model Server <../../ovms_what_is_openvino_model_server>` on Intel® architecture. Together, the OpenVINO™ Security Add-on and the OpenVINO™ Model Server provide a way for Model Developers and Independent Software Vendors to use secure packaging and secure model execution to enable access control to the OpenVINO™ models, and for model Users to run inference within assigned limits. +The OpenVINO™ Security Add-on works with the :doc:`OpenVINO™ Model Server <../../openvino-workflow/model-server/ovms_what_is_openvino_model_server>` on Intel® architecture. Together, the OpenVINO™ Security Add-on and the OpenVINO™ Model Server provide a way for Model Developers and Independent Software Vendors to use secure packaging and secure model execution to enable access control to the OpenVINO™ models, and for model Users to run inference within assigned limits. The OpenVINO™ Security Add-on consists of three components that run in Kernel-based Virtual Machines (KVMs). These components provide a way to run security-sensitive operations in an isolated environment. A brief description of the three components are as follows. Click each triangled line for more information about each. diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst index a77527db114bc7..4585ca97488023 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst @@ -102,7 +102,7 @@ Use the following code snippet to change the default settings: .. code-block:: python - pipeline_config = { "MAX_PROMPT_LEN": 1500, "MIN_RESPONSE_LEN": 500 } + pipeline_config = { "MAX_PROMPT_LEN": 1024, "MIN_RESPONSE_LEN": 512 } pipe = ov_genai.LLMPipeline(model_path, "NPU", pipeline_config) .. tab-item:: C++ @@ -110,7 +110,7 @@ Use the following code snippet to change the default settings: .. 
code-block:: cpp - ov::AnyMap pipeline_config = { { "MAX_PROMPT_LEN", 1500 }, { "MIN_RESPONSE_LEN", 500 } }; + ov::AnyMap pipeline_config = { { "MAX_PROMPT_LEN", 1024 }, { "MIN_RESPONSE_LEN", 512 } }; ov::genai::LLMPipeline pipe(model_path, "NPU", pipeline_config); diff --git a/docs/articles_en/openvino-workflow.rst b/docs/articles_en/openvino-workflow.rst index 0dda91f91fb552..942d6ed4b13a96 100644 --- a/docs/articles_en/openvino-workflow.rst +++ b/docs/articles_en/openvino-workflow.rst @@ -89,7 +89,7 @@ OpenVINO uses the following functions for reading, converting, and saving models | Deploy a model locally, reading the file directly from your application and utilizing about-openvino/additional-resources available to the system. | Deployment on a local system uses the steps described in the section on running inference. -| :doc:`Deployment Option 2. Using Model Server ` +| :doc:`Deployment Option 2. Using Model Server ` | Deploy a model remotely, connecting your application to an inference server and utilizing external about-openvino/additional-resources, with no impact on the app's performance. | Deployment on OpenVINO Model Server is quick and does not require any additional steps described in the section on running inference. diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst index 6c85473502ff9b..6348ca897c5ea5 100644 --- a/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst @@ -1,5 +1,5 @@ -Weight Compression -================== +LLM Weight Compression +========================= .. toctree:: :maxdepth: 1 @@ -187,7 +187,7 @@ trade-offs after optimization: ratio=0.9, ) -* ``scale_estimation`` - boolean parameter that enables more accurate estimation of +* ``scale_estimation`` - boolean parameter that enables more accurate estimation of quantization scales. Especially helpful when the weights of all layers are quantized to 4 bits. Requires dataset. diff --git a/docs/articles_en/openvino-workflow/model-optimization.rst b/docs/articles_en/openvino-workflow/model-optimization.rst index b4b6cc64acb21b..f5a5f97341e960 100644 --- a/docs/articles_en/openvino-workflow/model-optimization.rst +++ b/docs/articles_en/openvino-workflow/model-optimization.rst @@ -22,7 +22,7 @@ It is a `set of compression algorithms `__ and -`NNCF API documentation `__. + +Recommended workflows +########################## + +* A common approach for most cases is to: + + 1. Perform post-training quantization first, as it is the easiest option. + 2. For even better results, combine post-training quantization with filter pruning. + 3. If the accuracy drop is unacceptable, use quantization-aware training instead. It will give + you the same level of performance boost, with a smaller impact on accuracy. + +* **Weight compression** works **only with LLMs**. Do not try to use it with other models. +* For **visual-multimodal** use cases, the encoder / decoder split approach may be recommended. + + + + + .. image:: ../assets/images/DEVELOPMENT_FLOW_V3_crunch.svg + +Installation and usage +########################### + +To learn about the full scope of the framework, its installation, and technical details, visit +both `the NNCF repository `__ and +`NNCF API documentation `__. + + + .. tab-set:: .. 
tab-item:: Installation diff --git a/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst b/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst index 249fc8c4884cc1..86788b20249a3f 100644 --- a/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst +++ b/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst @@ -140,4 +140,4 @@ sequences. You can find more examples demonstrating how to work with states in other articles: * `LLM Chatbot notebook <../../notebooks/stable-zephyr-3b-chatbot-with-output.html>`__ -* :doc:`Serving Stateful Models with OpenVINO Model Server <../../ovms_docs_stateful_models>` +* :doc:`Serving Stateful Models with OpenVINO Model Server <../../openvino-workflow/model-server/ovms_docs_stateful_models>` diff --git a/docs/documentation_build_instructions.md b/docs/documentation_build_instructions.md index 490da1b1029bd3..d9219454b86a19 100644 --- a/docs/documentation_build_instructions.md +++ b/docs/documentation_build_instructions.md @@ -45,5 +45,4 @@ Depending on the needs, following variables can be added to first cmake call: - building C/C++ API: `-DENABLE_CPP_API=ON` - building Python API: `-DENABLE_PYTHON_API=ON` - building Notebooks: `-DENABLE_NOTEBOOKS=ON` -- building OMZ: `-DENABLE_OMZ=ON` - building OVMS: `-DENABLE_OVMS=ON -DOVMS_DOCS_DIR=` diff --git a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-Performance-Data.xlsx b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-Performance-Data.xlsx index 057e132d384167..9b53d90e0862db 100644 Binary files a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-Performance-Data.xlsx and b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-Performance-Data.xlsx differ diff --git a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-platform_list.pdf b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-platform_list.pdf index 6782dea6d0f3b2..cba78e5244acf1 100644 Binary files a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-platform_list.pdf and b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-platform_list.pdf differ diff --git a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-system-info-detailed.xlsx b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-system-info-detailed.xlsx index 7c4d9cd0e40919..4e243b8190c876 100644 Binary files a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-system-info-detailed.xlsx and b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.4-system-info-detailed.xlsx differ diff --git a/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms.json b/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms.json index 8457e2be7a6f4a..18a36073d582f5 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms.json +++ b/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms.json @@ -1,1102 +1,1047 @@ [ { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "bert-base-cased", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 479.649, - "int8": 482.878, - "ovmsfp32": 180.7, - "fp32": 179.541 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "bert-base-cased", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 159.534, + "fp32_ovms": 157.334, + "int8_ov": 432.339, + "int8_ovms": 420.793 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - 
"UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "bert-base-cased", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 428.173, - "int8": 430.397, - "ovmsfp32": 156.73, - "fp32": 159.276 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "bert-large-uncased-whole-word-masking-squad-0001", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 13.125, + "fp32_ovms": 13.254, + "int8_ov": 38.151, + "int8_ovms": 37.623 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "bert-base-cased", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 100.783, - "int8": 101.983, - "ovmsfp32": 35.711, - "fp32": 36.35 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "efficientdet-d0", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 247.445, + "fp32_ovms": 253.09, + "int8_ov": 413.083, + "int8_ovms": 377.844 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "bert-base-cased", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 98.441, - "int8": 102.62, - "ovmsfp32": 34.303, - "fp32": 36.096 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "mask_rcnn_resnet50_atrous_coco", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 1.638, + "fp32_ovms": 1.714, + "int8_ov": 6.202, + "int8_ovms": 6.126 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "bert-base-cased", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 26.185, - "int8": 26.436, - "ovmsfp32": 17.108, - "fp32": 17.395 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "mobilenet-v2", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 3333.399, + "fp32_ovms": 2905.171, + "int8_ov": 10422.241, + "int8_ovms": 7461.99 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "bert-large-uncased", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 41.872, - "int8": 42.401, - "ovmsfp32": 14.949, - "fp32": 14.473 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "resnet-50", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 575.208, + "fp32_ovms": 569.925, + "int8_ov": 2199.072, + "int8_ovms": 2064.581 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 
6238M CPU-only", - "Model": "bert-large-uncased", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 37.05, - "int8": 37.864, - "ovmsfp32": 13.075, - "fp32": 13.031 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "ssd-resnet34-1200", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 10.598, + "fp32_ovms": 10.472, + "int8_ov": 40.683, + "int8_ovms": 38.737 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "bert-large-uncased", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 10.047, - "int8": 10.111, - "ovmsfp32": 3.259, - "fp32": 3.237 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "ssd_mobilenet_v1_coco", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 1219.441, + "fp32_ovms": 1201.096, + "int8_ov": 4400.471, + "int8_ovms": 4270.702 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "bert-large-uncased", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 9.961, - "int8": 10.167, - "ovmsfp32": 3.236, - "fp32": 3.224 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "unet-camvid-onnx-0001", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 15.924, + "fp32_ovms": 15.763, + "int8_ov": 67.731, + "int8_ovms": 64.658 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "bert-large-uncased", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 2.43, - "int8": 2.427, - "ovmsfp32": 1.447, - "fp32": 1.428 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "yolo_v5m", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 74.189, + "fp32_ovms": 68.788, + "int8_ov": 247.757, + "int8_ovms": 180.302 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "Efficientdet-D0", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 439.435, - "int8": 485.287, - "ovmsfp32": 274.772, - "fp32": 272.856 + "Platform": "Intel® Xeon® Gold 6238M", + "Model": "yolo_v8n", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 332.326, + "fp32_ovms": 278.054, + "int8_ov": 740.985, + "int8_ovms": 609.062 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "Efficientdet-D0", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", 
- "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 376.1, - "int8": 415.275, - "ovmsfp32": 253.829, - "fp32": 259.188 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "bert-base-cased", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 182.025, + "fp32_ovms": 180.764, + "int8_ov": 485.82, + "int8_ovms": 472.842 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "Efficientdet-D0", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 131.735, - "int8": 148.558, - "ovmsfp32": 57.036, - "fp32": 59.907 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "bert-large-uncased-whole-word-masking-squad-0001", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 14.625, + "fp32_ovms": 15.132, + "int8_ov": 42.906, + "int8_ovms": 42.406 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "Efficientdet-D0", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 119.798, - "int8": 140.129, - "ovmsfp32": "", - "fp32": "" + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "efficientdet-d0", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 288.531, + "fp32_ovms": 278.548, + "int8_ov": 483.438, + "int8_ovms": 443.032 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "Efficientdet-D0", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 47.382, - "int8": 50.573, - "ovmsfp32": 30.226, - "fp32": 31.492 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "mask_rcnn_resnet50_atrous_coco", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 1.872, + "fp32_ovms": 1.95, + "int8_ov": 6.856, + "int8_ovms": 6.763 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 6.306, - "int8": 6.364, - "ovmsfp32": 1.96, - "fp32": 1.868 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "mobilenet-v2", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 3909.405, + "fp32_ovms": 3327.621, + "int8_ov": 12375.018, + "int8_ovms": 7554.235 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - 
"Precisions": [ - { - "ovmsint8": 5.652, - "int8": 5.771, - "ovmsfp32": 1.714, - "fp32": 1.639 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "resnet-50", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 634.732, + "fp32_ovms": 634.102, + "int8_ov": 2481.256, + "int8_ovms": 2349.872 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 1.309, - "int8": 1.267, - "ovmsfp32": 0.396, - "fp32": 0.371 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "ssd-resnet34-1200", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 12.166, + "fp32_ovms": 12.027, + "int8_ov": 47.295, + "int8_ovms": 44.525 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 1.293, - "int8": 1.271, - "ovmsfp32": 0.355, - "fp32": 0.346 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "ssd_mobilenet_v1_coco", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 1384.145, + "fp32_ovms": 1356.126, + "int8_ov": 5037.197, + "int8_ovms": 4834.045 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 0.38, - "int8": 0.352, - "ovmsfp32": 0.182, - "fp32": 0.151 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "unet-camvid-onnx-0001", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 18.26, + "fp32_ovms": 18.052, + "int8_ov": 77.933, + "int8_ovms": 73.527 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "Mobilenet-V2", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 7563.199, - "int8": 12406.597, - "ovmsfp32": 3336.015, - "fp32": 3972.673 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "yolo_v5m", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 85.149, + "fp32_ovms": 78.205, + "int8_ov": 281.889, + "int8_ovms": 204.353 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "Mobilenet-V2", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 7475.62, - "int8": 10373.146, - 
"ovmsfp32": 2934.976, - "fp32": 3381.725 + "Platform": "Intel® Xeon® Platinum 8260M", + "Model": "yolo_v8n", + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 376.079, + "fp32_ovms": 312.181, + "int8_ov": 801.556, + "int8_ovms": 678.929 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "Mobilenet-V2", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 2158.818, - "int8": 2742.363, - "ovmsfp32": 740.988, - "fp32": 874.037 + "Platform": "Intel® Core™ i7-11700K", + "Model": "bert-base-cased", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 35.915, + "fp32_ovms": 34.381, + "int8_ov": 101.976, + "int8_ovms": 99.024 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "Mobilenet-V2", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 2042.633, - "int8": 2809.471, - "ovmsfp32": 631.59, - "fp32": 759.984 + "Platform": "Intel® Core™ i7-11700K", + "Model": "bert-large-uncased-whole-word-masking-squad-0001", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 3.232, + "fp32_ovms": 3.266, + "int8_ov": 10.132, + "int8_ovms": 10.133 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "Mobilenet-V2", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 614.174, - "int8": 718.416, - "ovmsfp32": 381.882, - "fp32": 455.793 + "Platform": "Intel® Core™ i7-11700K", + "Model": "efficientdet-d0", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 51.747, + "fp32_ovms": 48.906, + "int8_ov": 142.489, + "int8_ovms": 124.167 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "Resnet-50", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 2356.238, - "int8": 2483.3, - "ovmsfp32": 628.616, - "fp32": 635.411 + "Platform": "Intel® Core™ i7-11700K", + "Model": "mask_rcnn_resnet50_atrous_coco", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 0.352, + "fp32_ovms": 0.364, + "int8_ov": 1.322, + "int8_ovms": 1.336 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "Resnet-50", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 2071.836, - "int8": 2202.317, - "ovmsfp32": 568.945, - "fp32": 575.057 + "Platform": "Intel® Core™ i7-11700K", + "Model": 
"mobilenet-v2", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 795.18, + "fp32_ovms": 664.842, + "int8_ov": 2721.454, + "int8_ovms": 2063.761 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "Resnet-50", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 440.533, - "int8": 458.665, - "ovmsfp32": 113.442, - "fp32": 116.116 + "Platform": "Intel® Core™ i7-11700K", + "Model": "resnet-50", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 114.859, + "fp32_ovms": 110.835, + "int8_ov": 467.591, + "int8_ovms": 445.408 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "Resnet-50", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 441.7, - "int8": 469.848, - "ovmsfp32": 107.395, - "fp32": 113.605 + "Platform": "Intel® Core™ i7-11700K", + "Model": "ssd-resnet34-1200", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 2.053, + "fp32_ovms": 2.074, + "int8_ov": 8.023, + "int8_ovms": 7.987 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "Resnet-50", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 114.045, - "int8": 118.024, - "ovmsfp32": 57.165, - "fp32": 58.366 + "Platform": "Intel® Core™ i7-11700K", + "Model": "ssd_mobilenet_v1_coco", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 260.104, + "fp32_ovms": 250.094, + "int8_ov": 991.064, + "int8_ovms": 930.128 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "SSD-Resnet34-1200", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 44.499, - "int8": 47.251, - "ovmsfp32": 12.074, - "fp32": 12.167 + "Platform": "Intel® Core™ i7-11700K", + "Model": "unet-camvid-onnx-0001", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 3.273, + "fp32_ovms": 3.3, + "int8_ov": 12.884, + "int8_ovms": 12.727 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "SSD-Resnet34-1200", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 38.714, - "int8": 40.662, - "ovmsfp32": 10.504, - "fp32": 10.653 + "Platform": "Intel® Core™ i7-11700K", + "Model": "yolo_v5m", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 
14.714, + "fp32_ovms": 14.243, + "int8_ov": 55.058, + "int8_ovms": 47.548 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "SSD-Resnet34-1200", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 7.756, - "int8": 7.818, - "ovmsfp32": 2.029, - "fp32": 2.005 + "Platform": "Intel® Core™ i7-11700K", + "Model": "yolo_v8n", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 71.446, + "fp32_ovms": 64.775, + "int8_ov": 200.864, + "int8_ovms": 144.792 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "SSD-Resnet34-1200", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 7.929, - "int8": 8.032, - "ovmsfp32": 2.072, - "fp32": 2.054 + "Platform": "Intel® Core™ i9-11900K", + "Model": "bert-base-cased", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 36.227, + "fp32_ovms": 35.646, + "int8_ov": 101.562, + "int8_ovms": 100.382 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "SSD-Resnet34-1200", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 1.947, - "int8": 1.937, - "ovmsfp32": 1.037, - "fp32": 1.008 + "Platform": "Intel® Core™ i9-11900K", + "Model": "bert-large-uncased-whole-word-masking-squad-0001", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 3.23, + "fp32_ovms": 3.254, + "int8_ov": 10.05, + "int8_ovms": 10.092 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "SSD_Mobilenet_V1_Coco", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 4732.691, - "int8": 4875.291, - "ovmsfp32": 1362.268, - "fp32": 1375.237 + "Platform": "Intel® Core™ i9-11900K", + "Model": "efficientdet-d0", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 59.759, + "fp32_ovms": 55.851, + "int8_ov": 149.505, + "int8_ovms": 131.453 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "SSD_Mobilenet_V1_Coco", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 4168.575, - "int8": 4279.825, - "ovmsfp32": 1199.883, - "fp32": 1226.189 + "Platform": "Intel® Core™ i9-11900K", + "Model": "mask_rcnn_resnet50_atrous_coco", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 0.368, + "fp32_ovms": 0.394, + "int8_ov": 1.308, + "int8_ovms": 1.338 + } + ], 
+ "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "SSD_Mobilenet_V1_Coco", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 921.041, - "int8": 1001.672, - "ovmsfp32": 268.066, - "fp32": 280.987 + "Platform": "Intel® Core™ i9-11900K", + "Model": "mobilenet-v2", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 865.806, + "fp32_ovms": 734.822, + "int8_ov": 2743.201, + "int8_ovms": 2163.412 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "SSD_Mobilenet_V1_Coco", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 915.4, - "int8": 1028.233, - "ovmsfp32": 244.534, - "fp32": 260.822 + "Platform": "Intel® Core™ i9-11900K", + "Model": "resnet-50", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 116.784, + "fp32_ovms": 113.046, + "int8_ov": 457.358, + "int8_ovms": 440.924 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "SSD_Mobilenet_V1_Coco", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 256.018, - "int8": 266.401, - "ovmsfp32": 129.917, - "fp32": 135.312 + "Platform": "Intel® Core™ i9-11900K", + "Model": "ssd-resnet34-1200", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 2.006, + "fp32_ovms": 2.031, + "int8_ov": 7.817, + "int8_ovms": 7.75 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "Unet-Camvid--0001", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 73.429, - "int8": 77.693, - "ovmsfp32": 18.104, - "fp32": 17.938 + "Platform": "Intel® Core™ i9-11900K", + "Model": "ssd_mobilenet_v1_coco", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 274.42, + "fp32_ovms": 264.153, + "int8_ov": 997.987, + "int8_ovms": 915.681 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "Unet-Camvid--0001", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 64.29, - "int8": 67.517, - "ovmsfp32": 15.777, - "fp32": 15.927 + "Platform": "Intel® Core™ i9-11900K", + "Model": "unet-camvid-onnx-0001", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 3.246, + "fp32_ovms": 3.272, + "int8_ov": 12.668, + "int8_ovms": 12.585 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is 
better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "Unet-Camvid--0001", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 12.574, - "int8": 12.628, - "ovmsfp32": 3.267, - "fp32": 3.253 + "Platform": "Intel® Core™ i9-11900K", + "Model": "yolo_v5m", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 14.985, + "fp32_ovms": 14.514, + "int8_ov": 54.937, + "int8_ovms": 47.767 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "Unet-Camvid--0001", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 12.718, - "int8": 12.881, - "ovmsfp32": 3.272, - "fp32": 3.297 + "Platform": "Intel® Core™ i9-11900K", + "Model": "yolo_v8n", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 74.1, + "fp32_ovms": 67.472, + "int8_ov": 203.493, + "int8_ovms": 151.175 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "Unet-Camvid--0001", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 2.995, - "int8": 2.976, - "ovmsfp32": 1.555, - "fp32": 1.53 + "Platform": "Intel® Core™ i3-10100", + "Model": "bert-base-cased", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 17.054, + "fp32_ovms": 17.124, + "int8_ov": 26.043, + "int8_ovms": 25.872 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "Yolo_V3_Tiny", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 1842.129, - "int8": 2317.052, - "ovmsfp32": 755.451, - "fp32": 777.681 + "Platform": "Intel® Core™ i3-10100", + "Model": "bert-large-uncased-whole-word-masking-squad-0001", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 1.434, + "fp32_ovms": 1.456, + "int8_ov": 2.421, + "int8_ovms": 2.450 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "Yolo_V3_Tiny", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 1667.812, - "int8": 2056.27, - "ovmsfp32": 675.447, - "fp32": 704.412 + "Platform": "Intel® Core™ i3-10100", + "Model": "efficientdet-d0", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 31.321, + "fp32_ovms": 30.316, + "int8_ov": 50.629, + "int8_ovms": 47.377 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "Yolo_V3_Tiny", - "Checked": true, - 
"PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 431.387, - "int8": 504.093, - "ovmsfp32": 145.92, - "fp32": 151.499 + "Platform": "Intel® Core™ i3-10100", + "Model": "mask_rcnn_resnet50_atrous_coco", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 0.151, + "fp32_ovms": 0.182, + "int8_ov": 0.361, + "int8_ovms": 0.389 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "Yolo_V3_Tiny", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 409.268, - "int8": 516.794, - "ovmsfp32": 139.903, - "fp32": 147.235 + "Platform": "Intel® Core™ i3-10100", + "Model": "mobilenet-v2", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 442.763, + "fp32_ovms": 380.661, + "int8_ov": 724.232, + "int8_ovms": 617.393 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "Yolo_V3_Tiny", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 117.276, - "int8": 133.578, - "ovmsfp32": 65.341, - "fp32": 69.29 + "Platform": "Intel® Core™ i3-10100", + "Model": "resnet-50", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 57.978, + "fp32_ovms": 57.038, + "int8_ov": 118.213, + "int8_ovms": 113.691 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® 8260M CPU-only", - "Model": "Yolo_V8n", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": "", - "int8": "", - "ovmsfp32": 314.652, - "fp32": 386.299 + "Platform": "Intel® Core™ i3-10100", + "Model": "ssd-resnet34-1200", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 1.0, + "fp32_ovms": 1.031, + "int8_ov": 1.937, + "int8_ovms": 1.954 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Xeon® Gold 6238M CPU-only", - "Model": "Yolo_V8n", - "Checked": true, - "PlatformType": "Mobile Platforms (Intel® Atom™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": "", - "int8": "", - "ovmsfp32": 282.302, - "fp32": 340.845 + "Platform": "Intel® Core™ i3-10100", + "Model": "ssd_mobilenet_v1_coco", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 133.421, + "fp32_ovms": 129.949, + "int8_ov": 267.141, + "int8_ovms": 256.821 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i9-11900K CPU-only", - "Model": "Yolo_V8n", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 153.817, - "int8": 204.691, - "ovmsfp32": 
67.421, - "fp32": 74.996 + "Platform": "Intel® Core™ i3-10100", + "Model": "unet-camvid-onnx-0001", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 1.515, + "fp32_ovms": 1.534, + "int8_ov": 2.96, + "int8_ovms": 2.973 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i7-11700K CPU-only", - "Model": "Yolo_V8n", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 143.19, - "int8": 197.409, - "ovmsfp32": 62.948, - "fp32": 70.913 + "Platform": "Intel® Core™ i3-10100", + "Model": "yolo_v5m", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 7.691, + "fp32_ovms": 7.511, + "int8_ov": 14.919, + "int8_ovms": 13.832 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } }, { - "Platform": "Intel® Core™ i3-10100 CPU-only", - "Model": "Yolo_V8n", - "Checked": true, - "PlatformType": "Client Platforms (Intel® Core™)", - "Parameters": { - "throughput": { - "Precisions": [ - { - "ovmsint8": 56.244, - "int8": 67.968, - "ovmsfp32": 34.396, - "fp32": 38.576 + "Platform": "Intel® Core™ i3-10100", + "Model": "yolo_v8n", + "PlatformType": "Client Platforms (Intel® Core™)", + "Parameters": { + "throughput": { + "Precisions": [ + { + "fp32_ov": 38.482, + "fp32_ovms": 34.513, + "int8_ov": 68.126, + "int8_ovms": 55.698 + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" } - } } ] \ No newline at end of file diff --git a/docs/sphinx_setup/_static/benchmarks_files/graph-config.json b/docs/sphinx_setup/_static/benchmarks_files/graph-config.json index df371e23c8e6eb..6fb8d19e1a1adf 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/graph-config.json +++ b/docs/sphinx_setup/_static/benchmarks_files/graph-config.json @@ -1,13 +1,15 @@ { "PrecisionsMap": { - "OVMSINT8": "ovmsint8", - "OVMSFP32": "ovmsfp32", "INT4": "int4", "INT8": "int8", "FP16": "fp16", "FP32": "fp32", - "BF16": "bf16" + "BF16": "bf16", + "FP32_OV": "fp32_ov", + "FP32_OVMS": "fp32_ovms", + "INT8_OV": "int8_ov", + "INT8_OVMS": "int8_ovms" }, "ParametersMap": { "Throughput": "throughput", @@ -27,7 +29,7 @@ "PrecisionData": { "int4": { "data": null, - "color": "#5bd0f0", + "color": "#76D8F6", "label": "INT4" }, "int8": { @@ -50,15 +52,25 @@ "color": "#00536a", "label": "BF16" }, - "ovmsint8": { + "fp32_ov": { + "data": null, + "color": "#76D8F6", + "label": "FP32 OV" + }, + "fp32_ovms": { + "data": null, + "color": "#00C7FD", + "label": "FP32 OVMS" + }, + "int8_ov": { "data": null, "color": "#009fca", - "label": "FPS( OV Ref. 
INT8)" + "label": "INT8 OV" }, - "ovmsfp32": { + "int8_ovms": { "data": null, "color": "#00536a", - "label": "BF16" + "label": "INT8 OVMS" } }, "Filters": [ diff --git a/scripts/setupvars/setupvars.bat b/scripts/setupvars/setupvars.bat index fac3e7f66c4ed4..8a09d974ecb295 100644 --- a/scripts/setupvars/setupvars.bat +++ b/scripts/setupvars/setupvars.bat @@ -67,7 +67,7 @@ set "PATH=%OPENVINO_LIB_PATHS%;%PATH%" :: Check if Python is installed set PYTHON_VERSION_MAJOR=3 -set MIN_REQUIRED_PYTHON_VERSION_MINOR=8 +set MIN_REQUIRED_PYTHON_VERSION_MINOR=9 set MAX_SUPPORTED_PYTHON_VERSION_MINOR=13 python --version 2>NUL diff --git a/scripts/setupvars/setupvars.ps1 b/scripts/setupvars/setupvars.ps1 index 7dacef5df4306b..2f0f960c1a08e3 100644 --- a/scripts/setupvars/setupvars.ps1 +++ b/scripts/setupvars/setupvars.ps1 @@ -63,7 +63,7 @@ Write-Host "[setupvars] OpenVINO environment initialized" # Check if Python is installed $PYTHON_VERSION_MAJOR = 3 -$MIN_REQUIRED_PYTHON_VERSION_MINOR = 8 +$MIN_REQUIRED_PYTHON_VERSION_MINOR = 9 $MAX_SUPPORTED_PYTHON_VERSION_MINOR = 13 try diff --git a/scripts/setupvars/setupvars.sh b/scripts/setupvars/setupvars.sh index 3b4fb9407f9090..422bc4a035dd8b 100755 --- a/scripts/setupvars/setupvars.sh +++ b/scripts/setupvars/setupvars.sh @@ -100,7 +100,7 @@ if command -v lsb_release >/dev/null 2>&1; then fi PYTHON_VERSION_MAJOR="3" -MIN_REQUIRED_PYTHON_VERSION_MINOR="8" +MIN_REQUIRED_PYTHON_VERSION_MINOR="9" MAX_SUPPORTED_PYTHON_VERSION_MINOR="13" check_python_version () { diff --git a/src/bindings/js/docs/CODESTYLE.md b/src/bindings/js/docs/CODESTYLE.md index 0ebfd322767b57..2441663d6cc424 100644 --- a/src/bindings/js/docs/CODESTYLE.md +++ b/src/bindings/js/docs/CODESTYLE.md @@ -1,9 +1,14 @@ # Code Style Guide +Node.js bindings contain two parts: C++ and Typescript/JavaScript. + This article presents the coding standards for JavaScript and TypeScript parts of **openvino-node** package. The following rules will help maintain code quality and consistency throughout the codebase. +For C++ codestyle rules, refer to [this document](https://github.com/openvinotoolkit/openvino/blob/master/docs/dev/coding_style.md). + Make sure your IDE has ESLint plugin installed. Its rules are specified in the [.eslint-global.js file](../.eslintrc-global.js). Keep in mind that your PR will not be approved if it does not meet the following requirements. + ## General Rules ### 1. Semicolons @@ -89,6 +94,7 @@ Make sure your IDE has ESLint plugin installed. Its rules are specified in the [ - Special case for the `catch` keyword: No space after `catch` - **Enforced By**: `keyword-spacing: ['error', { overrides: { catch: { after: false } } }]` + ## Additional Resources For further details on each rule, refer to the [ESLint documentation](https://eslint.org/docs/rules/). 
diff --git a/src/bindings/js/docs/README.md b/src/bindings/js/docs/README.md index bada676878847f..f0c70cf4dd9aed 100644 --- a/src/bindings/js/docs/README.md +++ b/src/bindings/js/docs/README.md @@ -2,10 +2,10 @@ ## Folders -- `./docs` - documentation -- `./node` - openvino-node npm package +- [./docs](../docs/) - documentation +- [./node](../node/) - openvino-node npm package -## openvino-node Package Developer Documentation +## `openvino-node` Package Developer Documentation ### Components @@ -28,7 +28,6 @@ ```bash cmake \ -DCMAKE_BUILD_TYPE=Release \ - -DENABLE_FASTER_BUILD=ON \ -DCPACK_GENERATOR=NPM \ -DENABLE_SYSTEM_TBB=OFF -UTBB* \ -DENABLE_TESTS=OFF \ @@ -75,9 +74,9 @@ [OpenVINO™ Node.js Bindings Examples of Usage](../../../../samples/js/node/README.md) -## Contribution +## Contributing -If you want to contribute to the project, refer to the [code style rules](./CODESTYLE.md) and [contribution guide](../../../../CONTRIBUTING.md) first. +Your contributions are welcome! Make sure to read the [Contribution Guide](https://github.com/openvinotoolkit/openvino/blob/master/src/bindings/js/node/CONTRIBUTING.md) to learn how you can get involved. ## See Also diff --git a/src/bindings/js/docs/code_examples.md b/src/bindings/js/docs/code_examples.md index 13bfa14812d54b..08d92e7307dbfe 100644 --- a/src/bindings/js/docs/code_examples.md +++ b/src/bindings/js/docs/code_examples.md @@ -1,22 +1,24 @@ # How to extend the OpenVINO™ JavaScript API code -## Build the OpenVINO™ JavaScript API +## Build the OpenVINO™ JavaScript API + For detailed build instructions, refer to the [OpenVINO™ JavaScript API documentation](./README.md). + ## Project's naming conventions + When implementing the C++ sources for the JavaScript API, it is essential to adhere to the OpenVINO naming conventions described in the [OpenVINO Coding Style Guide](../../../../docs/dev/coding_style.md). In summary, the naming style employs `Snake Case` for methods, functions, and variables, while `Camel Case` is used for class names. Additionally, the naming of entities in the C++ sources should closely mirror their equivalents in the C++ API to maintain consistency. For methods that are exposed to JavaScript, the naming convention transitions to `Camel Case`, aligning with common JavaScript practices. As an example, a method in the C++ API named `get_element_type` would be represented in the JavaScript API as `getElementType()`. + ## node-addon-api module [node addon api](https://github.com/nodejs/node-addon-api) is used to create OpenVINO JavaScript API for Node.js. The quickest way to learn is to follow the official [examples](https://github.com/nodejs/node-addon-examples). It is recommended to check out the tutorial on [how to create a JavaScript object from a C++ object](https://github.com/nodejs/node-addon-examples/tree/main/src/2-js-to-native-conversion/object-wrap-demo/node-addon-api). - - - ## Adding a new class and method + To introduce a new `MyTensor` class that interacts with the `ov::Tensor` class, follow these steps: - The class should facilitate construction from an ov::Tensor instance and allow initialization from a JavaScript element type and shape. - It should also provide a getElementType method that retrieves the ov::Tensor element type. 
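To make the naming convention above concrete, here is a minimal, hypothetical sketch (not taken from this PR) of how a snake_case C++ method is registered under its camelCase JavaScript name with node-addon-api; the `_tensor` member and the `get_class` helper are illustrative assumptions, not the actual openvino-node sources:

```cpp
#include <napi.h>

#include "openvino/runtime/tensor.hpp"

// Hypothetical wrapper used only to illustrate the naming rule: the C++ method keeps
// its snake_case name, while the string passed to InstanceMethod() is the camelCase
// name that becomes visible from JavaScript.
class MyTensor : public Napi::ObjectWrap<MyTensor> {
public:
    explicit MyTensor(const Napi::CallbackInfo& info) : Napi::ObjectWrap<MyTensor>(info) {}

    static Napi::Function get_class(Napi::Env env) {
        return DefineClass(env,
                           "MyTensor",
                           {
                               InstanceMethod("getElementType", &MyTensor::get_element_type),
                           });
    }

    // Exposed to JavaScript as tensor.getElementType()
    Napi::Value get_element_type(const Napi::CallbackInfo& info) {
        return Napi::String::New(info.Env(), _tensor.get_element_type().to_string());
    }

private:
    ov::Tensor _tensor;  // assumed member that holds the wrapped ov::Tensor
};
```

From JavaScript, the same method would then be called as `tensor.getElementType()`.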
@@ -25,7 +27,7 @@ Begin by creating a header file for the `MyTensor` class in the OpenVINO reposit ```cpp class MyTensor : public Napi::ObjectWrap { public: - // Constructor for the wrapper class + // Constructor for the wrapper class MyTensor(const Napi::CallbackInfo& info); // It returns a JavaScript class definition @@ -75,12 +77,15 @@ add_library(${PROJECT_NAME} SHARED ) ``` + ### Argument validation and conversion When binding JavaScript arguments with C++ functions, it is crucial to validate and convert the arguments appropriately. The template `ov::js::validate` function is a utility that facilitates this process. It is particularly useful for handling different overloads of functions and ensuring standardized error messages when arguments do not match expected signatures. Before implementing a new conversion function, such as `js_to_cpp`, review the existing [helper methods](../../node/include/helper.hpp) to see if one already meets your requirements. + ### New class initialization + When a new class is introduced to the `openvino-node` module, it must be initialized upon module loading. This is done in the [addon.cpp](../../src/addon.cpp) file. The initialization process registers the class with the Node.js environment so that it can be used within JavaScript code. ```cpp Napi::Object init_module(Napi::Env env, Napi::Object exports) { @@ -100,6 +105,7 @@ struct AddonData { ``` ### Document the new functionality + The last step is to add the TypeScript type definitions and describe the new functionality. ```typescript /** @@ -132,9 +138,9 @@ export interface NodeAddon { Now that coding is finished, remember to rebuild the project and test it out. -To learn how to test your code, refer to the guide on [how to test OpenVINO™ JavaScript API.](./test_examples.md) +To learn how to test your code, refer to the guide on [how to test OpenVINO™ JavaScript API.](./test_examples.md) ## See also * [OpenVINO™ README](../../../../README.md) * [OpenVINO™ bindings README](../../README.md) - * [Developer documentation](../../../../docs/dev/index.md) \ No newline at end of file + * [Developer documentation](../../../../docs/dev/index.md) diff --git a/src/bindings/js/docs/test_examples.md b/src/bindings/js/docs/test_examples.md index b8ff0c8ff7c9d0..0e75cb56f3a700 100644 --- a/src/bindings/js/docs/test_examples.md +++ b/src/bindings/js/docs/test_examples.md @@ -1,6 +1,6 @@ # How to test the OpenVINO™ JavaScript API -## Build the OpenVINO™ JavaScript API +## Build the OpenVINO™ JavaScript API For detailed build instructions, refer to the [OpenVINO™ JavaScript API documentation](./README.md). @@ -17,14 +17,14 @@ npm run test To run specific test files, you can pass one or more glob patterns: ```shell -node --test "tests/unit/core.test.js" "tests/unit/*model.test.js" +node --test "tests/unit/core.test.js" "tests/unit/*model.test.js" ``` Before executing individual test files, a one-time setup is required. If you have not previously executed `npm run test`, initiate the setup by running the following command: ```shell npm run test_setup -``` +``` More information on running tests from the command line can be found in the [Node.js documentation]( https://nodejs.org/docs/latest/api/test.html#running-tests-from-the-command-line). @@ -45,11 +45,11 @@ It is recommended to run the code style check each time new tests are added. ## Writing OpenVINO™ JavaScript API tests + ### Before start Follow and complete [Examples of OpenVINO™ JavaScript API code](./code_examples.md). 
- ### Adding new test-case in the correct place Each new test should verify the correct behavior of the new functionality (e.g. class, method). @@ -57,7 +57,8 @@ Unit test files are located in the `/src/bindings/js/node/tests/u Always add tests to the correct locations and create new files only when necessary. *Remember to include the license on top of each new file*. -### Test writing guidelines + +### Test writing guidelines Each test file starts with a `describe` block to group all tests related to a specific class or module. The name of the `describe` block should match the name of the class or module being tested, for example *ov.Core tests*. Within the `describe` block, individual tests are defined using `test` or `it` blocks, with the name of the test reflecting what is being tested. If multiple tests relate to the same method, they can be grouped within a nested `describe` block. diff --git a/src/bindings/js/node/CONTRIBUTING.md b/src/bindings/js/node/CONTRIBUTING.md new file mode 100644 index 00000000000000..aacef418aeed2d --- /dev/null +++ b/src/bindings/js/node/CONTRIBUTING.md @@ -0,0 +1,67 @@ +# Contributing to OpenVINO™ Node.js API + +Your commitment to this project is greatly appreciated and the following guide is intended to help you contribute. + +Make sure to read [main contribution guide](https://github.com/openvinotoolkit/openvino/blob/master/CONTRIBUTING.md) first. It covers most topics related to contributing to OpenVINO. + + +## TLDR + +1. Decide what you want to change. +2. Create your fork of the OpenVINO repository. +3. Create a branch with a meaningful name for your changes. +4. Align the code style, commit the changes, and run tests. +5. Create a Pull Request, which clearly describes what has been changed and why. +6. Go through the Code Review. +7. Get your awesome code merged! + +Read the section below for more details. + + +## How to Decide What to Change + +In case of minor fixes, like changing variable names, additional parameter checks, etc., go to the next step. + +However, if you want to bring significant changes, for example, the extension of architecture or a big part of functionality, that involves a large amount +of source code, open [an issue](https://github.com/openvinotoolkit/openvino/issues/new?assignees=octocat&labels=enhancement%2Cfeature&projects=&template=feature_request.yml&title=%5BFeature+Request%5D%3A+) first and discuss your idea with +codeowners. It will prevent you from doing extra work. + +You can also take one of the well-described tasks from the [Good First Issue](https://github.com/orgs/openvinotoolkit/projects/3/views/14) section. It can be a great start to contributing with codeowners' support! + + +## Let's code + +Get familiar with Node.js API architecture and code samples. +Refer to the [guide](../docs/code_examples.md), which will help you understand the component structure and the code style. + +The environment setup and build instructions can be found in [Building the Node.js API](https://github.com/openvinotoolkit/openvino/blob/master/src/bindings/js/docs/README.md#openvino-node-package-developer-documentation). + +Run tests! If you add a new functionality, make sure that it is covered by tests first. +Read [the guide](../docs/test_examples.md) for more details about the tests and their runs. +Many CI checks will run after getting a Code Review. Make sure that +all checks have passed. CI checks are composed of both functional tests and code-style checks and may fail because of warnings/errors in both stages. 
+ +Remember to follow [our codestyle](../docs/CODESTYLE.md). +By following the provided guide and using automated code style checking tools, like +**eslint** and **clang-format-9**, you will save some time and help with the code review of proposed changes. + + +## Description of the Pull Request + +Add the `[OV JS]` tag to all PR titles. Provide any relevant details in the description, as it will definitely help with the review. The minimum requirement is a compact, bulleted list of proposed changes. + +Use the following template: +``` +*Describe what is the purpose of this PR* + +### Details: +- *Describe your changes.* +- ... + +``` + + +## License + +By contributing to the OpenVINO project, you agree that your contributions will be +licensed under the terms of the [LICENSE](https://github.com/openvinotoolkit/openvino/blob/master/LICENSE). diff --git a/src/bindings/js/node/README.md b/src/bindings/js/node/README.md index e2c38f2a18e516..c927bd0b360ed4 100644 --- a/src/bindings/js/node/README.md +++ b/src/bindings/js/node/README.md @@ -1,8 +1,14 @@ # OpenVINO™ Node.js Bindings -Use OpenVINO JavaScript API for your Node.js application. +Use OpenVINO to deploy deep learning models easily in Node.js applications. -## Usage +## Introduction + +OpenVINO™ is an open-source toolkit designed for high-performance deep learning inference. +The Node.js API provides bindings to a subset of the OpenVINO Runtime API. +The Node.js bindings enable JavaScript developers to use the capabilities of OpenVINO in their applications. + +## Quick Start Install the **openvino-node** package: ```bash @@ -14,15 +20,21 @@ Use the **openvino-node** package: ``` const { addon: ov } = require('openvino-node'); ``` +Refer to the complete description of the `addon` API in the [documentation](https://docs.openvino.ai/2024/api/nodejs_api/addon.html). + +See the [samples](https://github.com/openvinotoolkit/openvino/blob/master/samples/js/node/README.md) for more details on how to use it. + ## Usage in Electron applications To use the package in development of Electron applications on Windows, make sure that **Desktop development with C++** component from [Build Tools for Visual Studio](https://aka.ms/vs/17/release/vs_BuildTools.exe) is installed. -## Build From Sources +## Supported Platforms -For more details, refer to the [OpenVINO™ JavaScript API Developer Documentation](https://github.com/openvinotoolkit/openvino/blob/master/src/bindings/js/docs/README.md#openvino-node-package-developer-documentation) +- Windows x86 +- Linux x86/ARM +- macOS x86/ARM ## Documentation & Samples @@ -31,11 +43,19 @@ For more details, refer to the [OpenVINO™ JavaScript API Developer Documentati ## Live Sample -You can run this sample in the browser; no installation is required. +You can run the following sample in the browser; no installation is required. [Codesandbox](https://codesandbox.io/) is a free online service with limited resources. For optimal performance and more control, it is recommended to run the sample locally. - [hello-classification-sample](https://codesandbox.io/p/devbox/openvino-node-hello-classification-sample-djl893) +## Build From Sources + +For more details, refer to the [OpenVINO™ JavaScript API Developer Documentation](https://github.com/openvinotoolkit/openvino/blob/master/src/bindings/js/docs/README.md#openvino-node-package-developer-documentation) + +## Contributing + +Contributions are always welcome!
Read the [Contribution Guide](https://github.com/openvinotoolkit/openvino/blob/master/src/bindings/js/node/CONTRIBUTING.md) to learn how you can get involved. + ## See Also * [OpenVINO™ README](https://github.com/openvinotoolkit/openvino/blob/master/README.md) diff --git a/src/bindings/js/node/package.json b/src/bindings/js/node/package.json index d00633c93b062a..8bc6bbd4bb1d46 100644 --- a/src/bindings/js/node/package.json +++ b/src/bindings/js/node/package.json @@ -48,5 +48,8 @@ "remote_path": "./repositories/openvino/nodejs_bindings/{version}/{platform}/", "package_name": "openvino_nodejs_bindings_{platform}_{version}_{arch}.tar.gz", "host": "https://storage.openvinotoolkit.org" - } + }, + "keywords": [ + "OpenVINO" + ] } diff --git a/src/bindings/python/src/openvino/preprocess/torchvision/requirements.txt b/src/bindings/python/src/openvino/preprocess/torchvision/requirements.txt index 23ba17d4918e71..201d5085bd1583 100644 --- a/src/bindings/python/src/openvino/preprocess/torchvision/requirements.txt +++ b/src/bindings/python/src/openvino/preprocess/torchvision/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cpu torch>=1.13 -torchvision; platform_machine == 'arm64' and python_version >= '3.8' +torchvision; platform_machine == 'arm64' and python_version >= '3.9' torchvision; platform_machine != 'arm64' pillow>=9.0 \ No newline at end of file diff --git a/src/common/transformations/include/ov_ops/rotary_positional_embeddings.hpp b/src/common/transformations/include/ov_ops/rotary_positional_embeddings.hpp index 5d839c19600340..dcb9aef187d2d9 100644 --- a/src/common/transformations/include/ov_ops/rotary_positional_embeddings.hpp +++ b/src/common/transformations/include/ov_ops/rotary_positional_embeddings.hpp @@ -27,6 +27,8 @@ class TRANSFORMATIONS_API RoPE : public Op { bool is_interleaved = false; // interleaved mode, implies trans0213 happens after RoPE size_t rotary_ndims = 0; // dimensions to be embedded (d in the description) bool is_chatglm = false; // chatglm is special which overrides other setting + bool support_2d_rope = false; // 2d rope mode: supports 2-dimensional RoPE which is independent of batch and + // each head;
change input order to [batch, head_cnt, 4608] to support 2d rope bool is_qwen = false; // Qwen is special which overrides other setting size_t head_cnt = 0; size_t head_size = 0; diff --git a/src/common/transformations/include/transformations/common_optimizations/fuse_rotary_positional_embeddings.hpp b/src/common/transformations/include/transformations/common_optimizations/fuse_rotary_positional_embeddings.hpp index 5cd99f88d13413..eb1c92bcf9607f 100644 --- a/src/common/transformations/include/transformations/common_optimizations/fuse_rotary_positional_embeddings.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/fuse_rotary_positional_embeddings.hpp @@ -38,7 +38,7 @@ class ov::pass::RoPEFusionGPTJ : public ov::pass::MatcherPass { class ov::pass::RoPEFusionChatGLM : public ov::pass::MatcherPass { public: OPENVINO_RTTI("RoPEFusionChatGLM", "0"); - RoPEFusionChatGLM(int split_output_id); + RoPEFusionChatGLM(int split_output_id, const bool support_2d_rope = false); }; class ov::pass::RoPEFusionQwen : public ov::pass::MatcherPass { @@ -84,7 +84,7 @@ class ov::pass::RoPEShareCosSin : public ov::pass::MatcherPass { class ov::pass::RoPEFusion : public ov::pass::GraphRewrite { public: OPENVINO_RTTI("RoPEFusion", "0"); - RoPEFusion() { + RoPEFusion(bool support_2d_rope = false) { add_matcher(); add_matcher(); // optional heads & tails are fused in separate matcher pass, @@ -95,6 +95,10 @@ class ov::pass::RoPEFusion : public ov::pass::GraphRewrite { add_matcher(0); add_matcher(1); + if (support_2d_rope) { + add_matcher(0, true); + add_matcher(1, true); + } add_matcher(0); add_matcher(1); diff --git a/src/common/transformations/src/ov_ops/rotary_positional_embeddings.cpp b/src/common/transformations/src/ov_ops/rotary_positional_embeddings.cpp index 915adecda0af68..3e75e2b88df266 100644 --- a/src/common/transformations/src/ov_ops/rotary_positional_embeddings.cpp +++ b/src/common/transformations/src/ov_ops/rotary_positional_embeddings.cpp @@ -45,13 +45,27 @@ void RoPE::validate_and_infer_types() { } if (m_config.is_chatglm) { - // chatGLM specific RoPE - // input [length, batch_size, (hidden_states_q + hidden_states_k + hidden_states_v)] - // output [length, batch_size, head_cnt, hidden_states_k] - set_output_type( - 0, - get_input_element_type(0), - {input_pshape[0], input_pshape[1], ov::Dimension(m_config.head_cnt), ov::Dimension(m_config.head_size)}); + if (m_config.support_2d_rope) { + // chatGLM specific RoPE + // input [batch_size, length, (hidden_states_q + hidden_states_k + hidden_states_v)] + // output [batch_size, head_cnt, length, hidden_states_k] + set_output_type(0, + get_input_element_type(0), + {input_pshape[0], + ov::Dimension(m_config.head_cnt), + input_pshape[1], + ov::Dimension(m_config.head_size)}); + } else { + // chatGLM specific RoPE + // input [length, batch_size, (hidden_states_q + hidden_states_k + hidden_states_v)] + // output [length, batch_size, head_cnt, hidden_states_k] + set_output_type(0, + get_input_element_type(0), + {input_pshape[0], + input_pshape[1], + ov::Dimension(m_config.head_cnt), + ov::Dimension(m_config.head_size)}); + } return; } @@ -79,6 +93,7 @@ bool RoPE::visit_attributes(ov::AttributeVisitor& visitor) { visitor.on_attribute("is_interleaved", m_config.is_interleaved); visitor.on_attribute("rotary_ndims", m_config.rotary_ndims); visitor.on_attribute("is_chatglm", m_config.is_chatglm); + visitor.on_attribute("support_2d_rope", m_config.support_2d_rope); visitor.on_attribute("is_qwen", m_config.is_qwen); 
visitor.on_attribute("head_cnt", m_config.head_cnt); visitor.on_attribute("head_size", m_config.head_size); diff --git a/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp b/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp index b6c19a0a0391fd..143603f0415373 100644 --- a/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp @@ -417,12 +417,16 @@ ov::pass::RoPEFusionGPTJ::RoPEFusionGPTJ() { this->register_matcher(m, callback); } -ov::pass::RoPEFusionChatGLM::RoPEFusionChatGLM(int split_output_id) { +ov::pass::RoPEFusionChatGLM::RoPEFusionChatGLM(int split_output_id, const bool support_2d_rope) { MATCHER_SCOPE(RoPEFusionChatGLM); - auto qkv_linear = makePattern("[?,?,?]"); // [seq_length, batch_size, 4608] + // [seq_length, batch_size, input_size(will be cropped to match hidden state size)] + // [batch_size, seq_length, input_size] support_2d_rope + auto qkv_linear = makePattern("[?,?,?]"); auto seq_length = makePattern("i32[1]"); - auto cos_sin_cache = makePattern("[?,?,?,?]"); // [max_pos_embeddings, batch_size, 32, 2] + // [max_pos_embeddings, batch_size, half_rotary_dims, 2] + // [batch_size, max_pos_embeddings, half_rotary_dims, 2] support_2d_rope + auto cos_sin_cache = makePattern("[?,?,?,?]"); auto ndims = ov::gen_pattern::Symbol("ndims"); auto head_cnt = ov::gen_pattern::Symbol("head_cnt"); @@ -436,37 +440,76 @@ ov::pass::RoPEFusionChatGLM::RoPEFusionChatGLM(int split_output_id) { auto qkv_proj = makePattern({qkv_linear, -1, {total_size_q, total_size_k, total_size_v}}); qkv_proj->set_output_size(3); - // get key [L, B, Hkv, S] auto cur_key = makePattern({qkv_proj->output(split_output_id), {0, 0, head_cnt, head_size}}, {{"special_zero", true}}); - auto slice_Slice_437 = GenSlice(cur_key, 0, ndims, 1, 3); - auto var_split_1 = makePattern({cur_key, 3, {ndims, ov::gen_pattern::Symbol("end")}}); + std::shared_ptr input_key = nullptr; + // Extended the RoPE to a two-dimensional form to accommodate the 2D positional encoding in GLM. 
+ // Calculate positional embedding independent of batch and each head + if (support_2d_rope) { + // Get transposed key [batch, head_cnt, seq_length, head_size] + input_key = makePattern({cur_key, {0, 2, 1, 3}}); + } else { + // Get key [seq_length, batch, head_cnt, head_size] + input_key = std::move(cur_key); + } + + auto slice_Slice_437 = GenSlice(input_key, 0, ndims, 1, 3); + auto var_split_1 = makePattern({input_key, 3, {ndims, ov::gen_pattern::Symbol("end")}}); var_split_1->set_output_size(2); // rotate half - auto ListConstruct_452_Concat = - makePattern({seq_length, {-1}, {head_cnt}, {ndims / 2}, {2}}, {{"axis", 0}}); - auto const_target_shape_1 = makeConst({seq_len, batch, head_cnt, ndims / 2, 2}); - - auto ListConstruct_379_Concat = - makePattern({seq_length, {-1}, {1}, {ndims / 2}, {2}}, {{"axis", 0}}); - auto const_target_shape_2 = makeConst({seq_len, batch, 1, ndims / 2, 2}); - - auto reshape_Reshape_453 = makePattern( - {slice_Slice_437 | var_split_1->output(0), ListConstruct_452_Concat | const_target_shape_1}); + std::shared_ptr reshape_Reshape_453 = nullptr; + if (support_2d_rope) { + auto const_target_shape_1 = makeConst({0, head_cnt, 0, ndims / 2, 2}); + reshape_Reshape_453 = + makePattern({slice_Slice_437 | var_split_1->output(0), const_target_shape_1}, + {{"special_zero", true}}); + } else { + auto ListConstruct_452_Concat = + makePattern({seq_length, {-1}, {head_cnt}, {ndims / 2}, {2}}, {{"axis", 0}}); + auto const_target_shape_1 = makeConst({seq_len, batch, head_cnt, ndims / 2, 2}); + reshape_Reshape_453 = makePattern( + {slice_Slice_437 | var_split_1->output(0), ListConstruct_452_Concat | const_target_shape_1}); + } auto x_even = makePattern({reshape_Reshape_453, 0, -1}, {{"batch_dims", 0}}); auto x_odd = makePattern({reshape_Reshape_453, 1, -1}, {{"batch_dims", 0}}); - auto slice_Slice_449 = makePattern({cos_sin_cache, {0}, seq_length, {1}, {0}}); - auto slice_StridedSlice_449 = GenStridedSlice(cos_sin_cache, {0}, seq_length, {1}, 0); + auto var_split_2 = makePattern({cos_sin_cache, 0, {0, ov::gen_pattern::Symbol("end")}}); var_split_2->set_output_size(2); - auto view_Reshape_460 = - makePattern({slice_StridedSlice_449 | slice_Slice_449 | var_split_2->output(0), - ListConstruct_379_Concat | const_target_shape_2}, - {{"special_zero", false}}); + std::shared_ptr view_Reshape_460 = nullptr; + if (support_2d_rope) { + auto ListConstruct_379_Concat = + makePattern({{-1}, {1}, seq_length, {ndims / 2}, {2}}, {{"axis", 0}}); + auto const_target_shape_2 = makeConst({batch, 1, seq_len, ndims / 2, 2}); + + // Slice cos_sin_cache to support 2-dimentional RoPE + auto ScatterUpdate = makePattern({{0, 0}, {1}, seq_length, {0}}, {}); + auto slice_Slice_449_1d = makePattern({cos_sin_cache, {0}, seq_length, {1}, {1}}); + auto slice_Slice_449_2d = makePattern({cos_sin_cache, {0, 0}, ScatterUpdate, {1, 1}, {0}}); + auto slice_StridedSlice_449 = GenStridedSlice(cos_sin_cache, {0, 0}, ScatterUpdate, {1, 1}, 1); + + // [batch, 1, seq_length, half_rotary_dims, 2] + view_Reshape_460 = makePattern( + {slice_StridedSlice_449 | slice_Slice_449_1d | slice_Slice_449_2d | var_split_2->output(0), + ListConstruct_379_Concat | const_target_shape_2}, + {{"special_zero", false}}); + } else { + auto ListConstruct_379_Concat = + makePattern({seq_length, {-1}, {1}, {ndims / 2}, {2}}, {{"axis", 0}}); + auto const_target_shape_2 = makeConst({seq_len, batch, 1, ndims / 2, 2}); + + auto slice_Slice_449 = makePattern({cos_sin_cache, {0}, seq_length, {1}, {0}}); + auto slice_StridedSlice_449 = 
GenStridedSlice(cos_sin_cache, {0}, seq_length, {1}, 0); + + // [seq_length, 1, batch, half_rotary_dims, 2] + view_Reshape_460 = + makePattern({slice_StridedSlice_449 | slice_Slice_449 | var_split_2->output(0), + ListConstruct_379_Concat | const_target_shape_2}, + {{"special_zero", false}}); + } auto cos_tab = makePattern({view_Reshape_460, 0, -1}, {{"batch_dims", 0}}); auto x_even_cos = makePattern({x_even, cos_tab}, {{"auto_broadcast", "numpy"}}); @@ -487,11 +530,21 @@ ov::pass::RoPEFusionChatGLM::RoPEFusionChatGLM(int split_output_id) { auto ShapeOf_135133 = makePattern({stack_481}); auto flatten_Slice_497 = GenSlice(ShapeOf_135133, 0, 3, 1, 0); auto flatten_Concat_500 = makePattern({flatten_Slice_497, {-1}}, {{"axis", 0}}); - auto const_target_shape_3 = makeConst({seq_len, batch, head_cnt, ndims}); - // [length, batch, head_cnt, half_rotary_dims, 2] - auto flatten_Reshape_501 = - makePattern({stack_481, flatten_Concat_500 | const_target_shape_3}, {{"special_zero", true}}); - auto slice_Slice_443 = GenSlice(cur_key, ndims, INT_MAX, 1, 3); + + std::shared_ptr const_target_shape_3 = nullptr; + std::shared_ptr flatten_Reshape_501 = nullptr; + if (support_2d_rope) { + // [batch, head_cnt, length, half_rotary_dims, 2] + const_target_shape_3 = makeConst({batch, head_cnt, seq_len, ndims}); + flatten_Reshape_501 = makePattern({stack_481, flatten_Concat_500 | const_target_shape_3}, + {{"special_zero", true}}); + } else { + // [length, batch, head_cnt, half_rotary_dims, 2] + const_target_shape_3 = makeConst({seq_len, batch, head_cnt, ndims}); + flatten_Reshape_501 = makePattern({stack_481, flatten_Concat_500 | const_target_shape_3}, + {{"special_zero", true}}); + } + auto slice_Slice_443 = GenSlice(input_key, ndims, INT_MAX, 1, 3); auto cat_Concat_505 = makePattern({flatten_Reshape_501, slice_Slice_443 | var_split_1->output(1)}, {{"axis", -1}}); @@ -510,6 +563,7 @@ ov::pass::RoPEFusionChatGLM::RoPEFusionChatGLM(int split_output_id) { OutputVector new_args; config.rotary_ndims = static_cast(validator["ndims"]); config.is_chatglm = true; + config.support_2d_rope = support_2d_rope; config.head_cnt = static_cast(validator["head_cnt"]); config.head_size = static_cast(validator["head_size"]); diff --git a/src/common/transformations/tests/common_optimizations/fuse_rotary_positional_embeddings.cpp b/src/common/transformations/tests/common_optimizations/fuse_rotary_positional_embeddings.cpp index 5b54b4a7cce437..6eb0add525c815 100644 --- a/src/common/transformations/tests/common_optimizations/fuse_rotary_positional_embeddings.cpp +++ b/src/common/transformations/tests/common_optimizations/fuse_rotary_positional_embeddings.cpp @@ -135,6 +135,7 @@ TEST_F(TransformationTestsF, ConvertToROPE_LLama2_no_gather) { {"config.input_trans0213", true}, {"config.is_interleaved", false}, {"config.is_chatglm", false}, + {"config.support_2d_rope", false}, {"config.is_qwen", false}, {"config.head_cnt", 0}, {"config.head_size", 0}, @@ -170,6 +171,7 @@ TEST_F(TransformationTestsF, ConvertToROPE_LLama2_with_gather) { {"config.input_trans0213", true}, {"config.is_interleaved", false}, {"config.is_chatglm", false}, + {"config.support_2d_rope", false}, {"config.is_qwen", false}, {"config.head_cnt", 0}, {"config.head_size", 0}, @@ -308,6 +310,7 @@ TEST_F(TransformationTestsF, ConvertToROPE_GPTNEOX_no_gather) { {"config.input_trans0213", true}, {"config.is_interleaved", false}, {"config.is_chatglm", false}, + {"config.support_2d_rope", false}, {"config.is_qwen", false}, {"config.head_cnt", 0}, {"config.head_size", 0}, @@ -342,6 
+345,7 @@ TEST_F(TransformationTestsF, ConvertToROPE_GPTNEOX_with_gather) { {"config.input_trans0213", true}, {"config.is_interleaved", false}, {"config.is_chatglm", false}, + {"config.support_2d_rope", false}, {"config.is_qwen", false}, {"config.head_cnt", 0}, {"config.head_size", 0}, @@ -457,6 +461,7 @@ TEST_F(TransformationTestsF, ConvertToROPE_GPTJ) { {"config.input_trans0213", false}, {"config.is_interleaved", true}, {"config.is_chatglm", false}, + {"config.support_2d_rope", false}, {"config.is_qwen", false}, {"config.head_cnt", 0}, {"config.head_size", 0}, @@ -566,6 +571,7 @@ TEST_F(TransformationTestsF, ConvertToROPE_chatGML) { {"config.is_interleaved", false}, {"config.rotary_ndims", rotary_ndims}, {"config.is_chatglm", true}, + {"config.support_2d_rope", false}, {"config.is_qwen", false}, {"config.head_cnt", num_heads}, {"config.head_size", ndims}, @@ -643,6 +649,7 @@ TEST_F(TransformationTestsF, ConvertToROPE_chatGML_Slice) { {"config.is_interleaved", false}, {"config.rotary_ndims", rotary_ndims}, {"config.is_chatglm", true}, + {"config.support_2d_rope", false}, {"config.is_qwen", false}, {"config.head_cnt", num_heads}, {"config.head_size", ndims}, @@ -723,6 +730,7 @@ TEST_F(TransformationTestsF, ConvertToROPE_GPTJ_Slice) { {"config.input_trans0213", false}, {"config.is_interleaved", true}, {"config.is_chatglm", false}, + {"config.support_2d_rope", false}, {"config.is_qwen", false}, {"config.head_cnt", 0}, {"config.head_size", 0}, @@ -730,4 +738,120 @@ TEST_F(TransformationTestsF, ConvertToROPE_GPTJ_Slice) { {"config.gather_position_arg_id", 0}}); model_ref = std::make_shared(ov::NodeVector{rope}, ov::ParameterVector{input, cos_sin}); } +} + +TEST_F(TransformationTestsF, ConvertToROPE_chatGML_2d_rope) { + disable_rt_info_check(); + const int batch = 2; + const int seq_len = 7; + const int num_heads = 32; + const int ndims = 128; + const int rotary_ndims = 64; + const int max_pos_length = 2048; + { + auto input = std::make_shared(ov::element::f32, ov::PartialShape{batch, seq_len, 4608}); + auto cos_sin_cache = + std::make_shared(ov::element::f32, + ov::PartialShape{max_pos_length, (rotary_ndims / 2), 2}); + auto position_ids = std::make_shared(ov::element::i32, ov::PartialShape{batch, seq_len}); + + auto __module_transformer_index_67_Gather = + makeOP({cos_sin_cache, position_ids, 0}, {{"batch_dims", 0}}); + + auto ListUnpack_321 = makeOP({input, -1, {4096, 256, 256}}); + auto view_Reshape = makeOP({ListUnpack_321->output(0), {0, 0, num_heads, ndims}}, + {{"special_zero", true}}); + + auto permute_Transpose = makeOP({view_Reshape, {0, 2, 1, 3}}, {}); + + auto slice_Slice_357 = + makeOP({permute_Transpose, {0, 0, 0, 0}, {0, 0, 0, rotary_ndims}, {1, 1, 1, 1}}, + {{"begin_mask", {1, 1, 1, 0}}, + {"end_mask", {1, 1, 1, 0}}, + {"new_axis_mask", {}}, + {"shrink_axis_mask", {}}, + {"ellipsis_mask", {}}}); + + auto aten_view_Reshape_1 = + makeOP({ListUnpack_321->output(1), {0, 0, 2, ndims}}, {{"special_zero", true}}); + auto aten_transpose_1 = makeOP({aten_view_Reshape_1, {0, 2, 1, 3}}); + auto shape_of_105249 = makeOP({aten_transpose_1}, {{"output_type", "i32"}}); + auto gather_105252 = makeOP({shape_of_105249, {2}, {0}}, {{"batch_dims", 0}}); + auto scatter_update_63441 = makeOP({{0, 0}, {1}, gather_105252, {0}}); + // connected to cos_sin_cache + auto slice_Slice_369 = makeOP( + {__module_transformer_index_67_Gather, {0, 0}, scatter_update_63441, {1, 1}}, + {{"begin_mask", {1, 0}}, + {"end_mask", {1, 0}}, + {"new_axis_mask", {}}, + {"shrink_axis_mask", {}}, + {"ellipsis_mask", {}}}); + 
auto list_construct_concat_1 = + makeOP({{-1}, {1}, gather_105252, {rotary_ndims / 2}, {2}}, {{"axis", 0}}); + + auto reshape_Reshape_373 = + makeOP({slice_Slice_357, {0, 32, 0, 32, 2}}, {{"special_zero", true}}); + auto select_Gather_384 = + makeOP({reshape_Reshape_373, 0, -1}, {{"batch_dims", 0}}); // x_even + auto select_Gather_381 = + makeOP({reshape_Reshape_373, 1, -1}, {{"batch_dims", 0}}); // x_odd + auto view_Reshape_380 = + makeOP({slice_Slice_369, list_construct_concat_1}, {{"special_zero", false}}); + auto select_Gather_385 = makeOP({view_Reshape_380, 0, -1}, {{"batch_dims", 0}}); // cos_tab + auto select_Gather_382 = makeOP({view_Reshape_380, 1, -1}, {{"batch_dims", 0}}); // sin_tab + + auto mul_Multiply_386 = makeOP({select_Gather_381, select_Gather_382}, + {{"auto_broadcast", "numpy"}}); // x_odd_sin + auto mul_Multiply_383 = makeOP({select_Gather_384, select_Gather_385}, + {{"auto_broadcast", "numpy"}}); // x_even_cos + auto Multiply_101315 = + makeOP({mul_Multiply_386, -1.000000f}, {{"auto_broadcast", "numpy"}}); + auto sub_Subtract_389 = + makeOP({mul_Multiply_383, Multiply_101315}, {{"auto_broadcast", "numpy"}}); + + auto mul_Multiply_391 = makeOP({select_Gather_381, select_Gather_385}, + {{"auto_broadcast", "numpy"}}); // x_odd_cos + auto mul_Multiply_393 = makeOP({select_Gather_384, select_Gather_382}, + {{"auto_broadcast", "numpy"}}); // x_even_sin + auto add_Add_396 = makeOP({mul_Multiply_391, mul_Multiply_393}, {{"auto_broadcast", "numpy"}}); + + auto Unsqueeze_62716 = makeOP({sub_Subtract_389, -1}, {}); + auto Unsqueeze_62717 = makeOP({add_Add_396, -1}, {}); + + auto stack_401 = makeOP({Unsqueeze_62716, Unsqueeze_62717}, {{"axis", -1}}); + auto flatten_Reshape_421 = + makeOP({stack_401, {0, num_heads, 0, rotary_ndims}}, {{"special_zero", true}}); + auto slice_Slice_363 = makeOP( + {permute_Transpose, {0, 0, 0, rotary_ndims}, {0, 0, 0, INT_MAX}, {1, 1, 1, 1}}, + {{"begin_mask", {1, 1, 1, 0}}, + {"end_mask", {1, 1, 1, 0}}, + {"new_axis_mask", {}}, + {"shrink_axis_mask", {}}, + {"ellipsis_mask", {}}}); + auto cat_Concat_425 = makeOP({flatten_Reshape_421, slice_Slice_363}, {{"axis", -1}}); + model = std::make_shared(ov::NodeVector{cat_Concat_425}, + ov::ParameterVector{input, cos_sin_cache, position_ids}); + } + manager.register_pass(true); + { + auto input = std::make_shared(ov::element::f32, ov::Shape{batch, seq_len, 4608}); + auto cos_sin_cache = + std::make_shared(ov::element::f32, ov::Shape{max_pos_length, (rotary_ndims / 2), 2}); + auto position_ids = std::make_shared(ov::element::i32, ov::PartialShape{batch, seq_len}); + auto gather_cos_sin = makeOP({cos_sin_cache, position_ids, 0}, {{"batch_dims", 0}}); + auto rope = makeOP({input, gather_cos_sin, gather_cos_sin}, + {{"config.slice_start", 0}, + {"config.slice_stop", 4096}, + {"config.input_trans0213", false}, + {"config.is_interleaved", false}, + {"config.rotary_ndims", rotary_ndims}, + {"config.is_chatglm", true}, + {"config.support_2d_rope", true}, + {"config.is_qwen", false}, + {"config.head_cnt", num_heads}, + {"config.head_size", ndims}, + {"config.gather_position_arg_id", 0}}); + model_ref = + std::make_shared(ov::NodeVector{rope}, ov::ParameterVector{input, cos_sin_cache, position_ids}); + } } \ No newline at end of file diff --git a/src/common/util/include/openvino/util/mmap_object.hpp b/src/common/util/include/openvino/util/mmap_object.hpp index 364e1eed4ca712..3aba8e69c094a1 100644 --- a/src/common/util/include/openvino/util/mmap_object.hpp +++ b/src/common/util/include/openvino/util/mmap_object.hpp 
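To recap the shape semantics introduced by `support_2d_rope`: with `is_chatglm` enabled, the op now accepts a `[batch, seq_length, hidden]` input and produces a `[batch, head_cnt, seq_length, head_size]` output instead of the seq-first layout. The following is a minimal sketch, not part of the diff; it assumes the op is exposed as `ov::op::internal::RoPE` (as in the OpenVINO source tree) and reuses the dimensions of the new unit test:

```cpp
#include <memory>

#include "openvino/op/parameter.hpp"
#include "ov_ops/rotary_positional_embeddings.hpp"

// Sketch: build a 2D-RoPE node with the dimensions used by the new unit test
// (batch=2, seq_len=7, 32 heads of size 128, rotary_ndims=64) and let
// validate_and_infer_types() propagate [batch, head_cnt, seq_len, head_size].
std::shared_ptr<ov::op::internal::RoPE> make_chatglm_2d_rope() {
    auto qkv = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{2, 7, 4608});
    auto cos_sin = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{2, 7, 32, 2});

    ov::op::internal::RoPE::Config config;
    config.is_chatglm = true;
    config.support_2d_rope = true;  // the new flag introduced by this change
    config.rotary_ndims = 64;
    config.head_cnt = 32;
    config.head_size = 128;
    config.slice_start = 0;
    config.slice_stop = 4096;  // keep only the query part of the fused QKV projection

    auto rope = std::make_shared<ov::op::internal::RoPE>(ov::OutputVector{qkv, cos_sin, cos_sin}, config);
    // rope->get_output_partial_shape(0) is expected to be {2, 32, 7, 128}.
    return rope;
}
```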
@@ -50,18 +50,4 @@ std::shared_ptr load_mmap_object(const std::string& path); std::shared_ptr load_mmap_object(const std::wstring& path); #endif // OPENVINO_ENABLE_UNICODE_PATH_SUPPORT - -class MmapStream final : public std::ifstream { -public: - MmapStream(const std::string& path) : std::ifstream(path, std::ios_base::binary) { - m_memory = ov::load_mmap_object(path); - } - -#ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT - MmapStream(const std::wstring& path); -#endif // OPENVINO_ENABLE_UNICODE_PATH_SUPPORT - - std::shared_ptr m_memory; -}; - } // namespace ov diff --git a/src/common/util/src/os/win/win_mmap_object.cpp b/src/common/util/src/os/win/win_mmap_object.cpp index 0b14d7ac774700..6f2515124273f1 100644 --- a/src/common/util/src/os/win/win_mmap_object.cpp +++ b/src/common/util/src/os/win/win_mmap_object.cpp @@ -141,11 +141,6 @@ std::shared_ptr load_mmap_object(const std::wstring& path) { holder->set(path); return holder; } - -MmapStream::MmapStream(const std::wstring& path) : std::ifstream(path.data(), std::ios_base::binary) { - m_memory = ov::load_mmap_object(path); -} - #endif } // namespace ov diff --git a/src/core/dev_api/openvino/op/paged_attention.hpp b/src/core/dev_api/openvino/op/paged_attention.hpp index e5995e0b8699b0..0c1c396cbefb5b 100644 --- a/src/core/dev_api/openvino/op/paged_attention.hpp +++ b/src/core/dev_api/openvino/op/paged_attention.hpp @@ -17,6 +17,11 @@ class OPENVINO_API PagedAttentionExtension : public ov::op::Op { PagedAttentionExtension(const ov::OutputVector& args); void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + void set_out_type(int index, const ov::element::Type& output_type); + +protected: + std::vector m_output_type = {ov::element::undefined, ov::element::undefined}; }; } // namespace op diff --git a/src/core/dev_api/openvino/runtime/shared_buffer.hpp b/src/core/dev_api/openvino/runtime/shared_buffer.hpp index 7f1e2e9ba7601f..2c784ef6081c35 100644 --- a/src/core/dev_api/openvino/runtime/shared_buffer.hpp +++ b/src/core/dev_api/openvino/runtime/shared_buffer.hpp @@ -8,7 +8,7 @@ namespace ov { -/// \brief SharedBuffer class to store pointer to pre-allocated buffer. +/// \brief SharedBuffer class to store pointer to pre-allocated buffer. Own the shared object. template class SharedBuffer : public ov::AlignedBuffer { public: @@ -28,4 +28,60 @@ class SharedBuffer : public ov::AlignedBuffer { T _shared_object; }; +/// \brief SharedStreamBuffer class to store pointer to pre-acclocated buffer and provide streambuf interface. +/// Can return ptr to shared memory and its size +class SharedStreamBuffer : public std::streambuf { +public: + SharedStreamBuffer(char* data, size_t size) : m_data(data), m_size(size), m_offset(0) {} + +protected: + // override std::streambuf methods + std::streamsize xsgetn(char* s, std::streamsize count) override { + auto real_count = std::min(m_size - m_offset, count); + std::memcpy(s, m_data + m_offset, real_count); + m_offset += real_count; + return real_count; + } + + int_type underflow() override { + return (m_size == m_offset) ? traits_type::eof() : traits_type::to_int_type(*(m_data + m_offset)); + } + + int_type uflow() override { + return (m_size == m_offset) ? 
traits_type::eof() : traits_type::to_int_type(*(m_data + m_offset++)); + } + + std::streamsize showmanyc() override { + return m_size - m_offset; + } + + pos_type seekoff(off_type off, + std::ios_base::seekdir dir, + std::ios_base::openmode which = std::ios_base::in) override { + if (dir != std::ios_base::cur || which != std::ios_base::in) { + return pos_type(off_type(-1)); + } + m_offset += off; + return pos_type(m_offset); + } + + char* m_data; + size_t m_size; + size_t m_offset; +}; + +/// \brief OwningSharedStreamBuffer is a SharedStreamBuffer which owns its shared object. +class OwningSharedStreamBuffer : public SharedStreamBuffer { +public: + OwningSharedStreamBuffer(std::shared_ptr buffer) + : SharedStreamBuffer(static_cast(buffer->get_ptr()), buffer->size()), + m_shared_obj(buffer) {} + + std::shared_ptr get_buffer() { + return m_shared_obj; + } + +protected: + std::shared_ptr m_shared_obj; +}; } // namespace ov diff --git a/src/core/include/openvino/op/search_sorted.hpp b/src/core/include/openvino/op/search_sorted.hpp index 78650942ee8f0f..c370ba46b2f182 100644 --- a/src/core/include/openvino/op/search_sorted.hpp +++ b/src/core/include/openvino/op/search_sorted.hpp @@ -36,8 +36,6 @@ class OPENVINO_API SearchSorted : public Op { m_right_mode = right_mode; } - bool validate() const; - private: bool m_right_mode{}; }; diff --git a/src/core/reference/include/openvino/reference/proposal.hpp b/src/core/reference/include/openvino/reference/proposal.hpp index f80faafc5efd27..a2d727b6156aea 100644 --- a/src/core/reference/include/openvino/reference/proposal.hpp +++ b/src/core/reference/include/openvino/reference/proposal.hpp @@ -319,6 +319,11 @@ static void proposal_exec(const T* class_probs, const Shape& output_shape, const Shape& out_probs_shape, const op::v0::Proposal::Attributes& attrs) { + const auto batch_num = static_cast(class_probs_shape[0]); + const auto coordinates_offset = attrs.framework == "tensorflow" ? 0.f : 1.f; + const auto initial_clip = attrs.framework == "tensorflow"; + const auto swap_xy = attrs.framework == "tensorflow"; + const T* p_bottom_item = class_probs; const T* p_d_anchor_item = bbox_deltas; T* p_roi_item = output; @@ -328,8 +333,8 @@ static void proposal_exec(const T* class_probs, const unsigned int bottom_H = static_cast(class_probs_shape[2]); const unsigned int bottom_W = static_cast(class_probs_shape[3]); // input image height and width - const T img_H = image_shape[0]; - const T img_W = image_shape[1]; + const T img_H = image_shape[swap_xy ? 1 : 0]; + const T img_W = image_shape[swap_xy ? 0 : 1]; // scale factor for H and W, depends on shape of image_shape // can be split into H and W {image_height, image_width, scale_height, // scale_width} @@ -350,11 +355,6 @@ static void proposal_exec(const T* class_probs, std::vector anchors = generate_anchors(attrs, anchor_count); - unsigned int batch_num = static_cast(class_probs_shape[0]); - float coordinates_offset = attrs.framework == "tensorflow" ? 
0.0f : 1.0f; - bool initial_clip = attrs.framework == "tensorflow"; - bool swap_xy = attrs.framework == "tensorflow"; - for (unsigned int batch_idx = 0; batch_idx < batch_num; ++batch_idx) { std::fill(roi_indices.begin(), roi_indices.end(), 0); num_rois = 0; diff --git a/src/core/shape_inference/include/search_sorted_shape_inference.hpp b/src/core/shape_inference/include/search_sorted_shape_inference.hpp index 7ea0598cffbc87..4b9d888891e835 100644 --- a/src/core/shape_inference/include/search_sorted_shape_inference.hpp +++ b/src/core/shape_inference/include/search_sorted_shape_inference.hpp @@ -12,28 +12,40 @@ namespace op { namespace v15 { template > std::vector shape_infer(const SearchSorted* op, const std::vector& input_shapes) { - // [HACK]: By convention, shape_infer should also perform node validation.. - op->validate(); const auto& sorted_shape = input_shapes[0]; const auto& values_shape = input_shapes[1]; + const auto is_sorted_rank_static = sorted_shape.rank().is_static(); + const auto is_values_rank_static = values_shape.rank().is_static(); - auto output_shape = values_shape; - - // 1. If we know that the sorted sequence is 1D, than output shape can be anything. - if (sorted_shape.rank().is_static() && sorted_shape.rank().get_length() == 1) { - return {std::move(output_shape)}; + if (!is_sorted_rank_static || sorted_shape.size() == 1) { + // If the sorted sequence is 1D, then any shape of the values input is allowed. + // The shape of the output is the same as the shape of the values. + return {values_shape}; } - // 2. ND tensor case or rank not known. - auto sorted_shape_last_dynamic = sorted_shape; - if (sorted_shape.rank().is_static()) { - sorted_shape_last_dynamic[sorted_shape.rank().get_length() - 1] = Dimension::dynamic(); + const auto sorted_in_rank = sorted_shape.size(); + NODE_SHAPE_INFER_CHECK(op, input_shapes, sorted_in_rank > 0, "The sorted sequence input cannot be a scalar."); + + TRShape output_shape; + if (!is_values_rank_static) { + output_shape = sorted_shape; + output_shape[sorted_in_rank - 1] = Dimension::dynamic(); + } else { + output_shape = values_shape; + NODE_SHAPE_INFER_CHECK( + op, + input_shapes, + sorted_in_rank == values_shape.size(), + "If the shape of sorted sequence is not 1D, the ranks of the inputs have to be compatible."); + using TDim = typename TShape::value_type; + for (size_t i = 0; i < sorted_in_rank - 1; ++i) { + NODE_SHAPE_INFER_CHECK(op, + input_shapes, + TDim::merge(output_shape[i], values_shape[i], sorted_shape[i]), + "All dimensions but the last one have to be compatible."); + } } - const bool sorted_values_merge_success = TShape::merge_into(output_shape, sorted_shape_last_dynamic); - - NODE_VALIDATION_CHECK(op, sorted_values_merge_success, "Shapes of sorted sequence and values are not compatible."); - return {std::move(output_shape)}; } } // namespace v15 diff --git a/src/core/src/op/paged_attention.cpp b/src/core/src/op/paged_attention.cpp index e3771bcbf92937..261b0ce1c47605 100644 --- a/src/core/src/op/paged_attention.cpp +++ b/src/core/src/op/paged_attention.cpp @@ -146,13 +146,27 @@ void PagedAttentionExtension::validate_and_infer_types() { get_input_element_type(12), "."); - set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); - set_output_type(1, get_input_element_type(0), {Dimension::dynamic()}); + if (m_output_type[0] == ov::element::undefined) { + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); + } else { + set_output_type(0, m_output_type[0], get_input_partial_shape(0)); + } 
+ + if (m_output_type[1] == ov::element::undefined) { + set_output_type(1, get_input_element_type(0), {Dimension::dynamic()}); + } else { + set_output_type(1, m_output_type[1], {Dimension::dynamic()}); + } } std::shared_ptr PagedAttentionExtension::clone_with_new_inputs(const ov::OutputVector& new_args) const { return std::make_shared(new_args); } +void PagedAttentionExtension::set_out_type(int index, const ov::element::Type& output_type) { + OPENVINO_ASSERT(index < 2, "Output index should be 0 or 1, but got " + std::to_string(index)); + m_output_type[index] = output_type; +} + } // namespace op } // namespace ov diff --git a/src/core/src/op/search_sorted.cpp b/src/core/src/op/search_sorted.cpp index d3f26a674eef91..8b9bb012b27106 100644 --- a/src/core/src/op/search_sorted.cpp +++ b/src/core/src/op/search_sorted.cpp @@ -18,34 +18,11 @@ SearchSorted::SearchSorted(const Output& sorted_sequence, const Output 1) { - NODE_VALIDATION_CHECK(this, - sorted_shape.rank().get_length() == values_shape.rank().get_length(), - "Sorted sequence and values have different ranks."); - - for (int64_t i = 0; i < sorted_shape.rank().get_length() - 1; ++i) { - NODE_VALIDATION_CHECK(this, - sorted_shape[i].compatible(values_shape[i]), - "Sorted sequence and values has different ", - i, - " dimension."); - } - } - - return true; -} - -void SearchSorted::validate_and_infer_types() { - OV_OP_SCOPE(v15_SearchSorted_validate_and_infer_types); const auto& output_shapes = shape_infer(this, ov::util::get_node_input_partial_shapes(*this)); set_output_type(0, ov::element::i64, output_shapes[0]); } diff --git a/src/core/tests/type_prop/search_sorted.cpp b/src/core/tests/type_prop/search_sorted.cpp index efc2c865416143..6dd10ad0ac3f5f 100644 --- a/src/core/tests/type_prop/search_sorted.cpp +++ b/src/core/tests/type_prop/search_sorted.cpp @@ -57,6 +57,10 @@ TEST(type_prop, search_sorted_shape_infer_sorted_1d_values_dynamic) { PerformShapeTest({8}, {-1, -1, 3}, {-1, -1, 3}); } +TEST(type_prop, search_sorted_shape_infer_scalar_values) { + PerformShapeTest({100}, {}, {}); +} + TEST(type_prop, search_sorted_shape_infer_both_dynamic_1) { PerformShapeTest({1, -1, 7, -1}, {-1, 3, -1, 10}, {1, 3, 7, 10}); } @@ -93,6 +97,19 @@ TEST(type_prop, search_sorted_shape_infer_both_dynamic_9) { PerformShapeTest({-1, -1}, PartialShape::dynamic(), {-1, -1}); } +TEST(type_prop, search_sorted_shape_symbols) { + PartialShape sorted_shape{1, 3, 7, 100}; + PartialShape values_shape{-1, -1, -1, 10}; + auto sorted_symbols = set_shape_symbols(sorted_shape); + auto values_symbols = set_shape_symbols(values_shape); + auto sorted = make_shared(element::i32, sorted_shape); + auto values = make_shared(element::i32, values_shape); + auto search_sorted_op = make_shared(sorted, values); + EXPECT_EQ(search_sorted_op->get_element_type(), element::i64); + EXPECT_THAT(get_shape_symbols(search_sorted_op->get_output_partial_shape(0)), + testing::ElementsAre(values_symbols[0], values_symbols[1], values_symbols[2], values_symbols[3])); +} + TEST(type_prop, search_sorted_shape_infer_different_types) { auto sorted = make_shared(element::f32, Shape{1, 3, 6}); auto values = make_shared(element::i32, Shape{1, 3, 6}); @@ -102,13 +119,27 @@ TEST(type_prop, search_sorted_shape_infer_different_types) { TEST(type_prop, search_sorted_shape_infer_wrong_rank) { auto sorted = make_shared(element::i32, Shape{1, 1, 3, 6}); auto values = make_shared(element::i32, Shape{1, 3, 6}); - EXPECT_THROW_SUBSTRING(sorted, values, std::string("Sorted sequence and values have different ranks")); + 
EXPECT_THROW_SUBSTRING(sorted, + values, + std::string("sequence is not 1D, the ranks of the inputs have to be compatible")); } TEST(type_prop, search_sorted_shape_infer_wrong_dim) { auto sorted = make_shared(element::i32, Shape{1, 1, 3, 6}); auto values = make_shared(element::i32, Shape{1, 1, 5, 6}); - EXPECT_THROW_SUBSTRING(sorted, values, std::string(" different 2 dimension.")); + EXPECT_THROW_SUBSTRING(sorted, values, std::string("All dimensions but the last one have to be compatible")); +} + +TEST(type_prop, search_sorted_shape_infer_scalar_sorted_sequence) { + auto sorted = make_shared(element::i32, Shape{}); + auto values = make_shared(element::i32, Shape{1, 1, 5, 6}); + EXPECT_THROW_SUBSTRING(sorted, values, std::string("The sorted sequence input cannot be a scalar")); +} + +TEST(type_prop, search_sorted_shape_infer_scalar_values_and_nd_sequence) { + auto sorted = make_shared(element::i32, Shape{2, 2}); + auto values = make_shared(element::i32, Shape{}); + EXPECT_THROW_SUBSTRING(sorted, values, std::string("the ranks of the inputs have to be compatible")); } #undef EXPECT_THROW_SUBSTRING \ No newline at end of file diff --git a/src/frontends/onnx/frontend/CMakeLists.txt b/src/frontends/onnx/frontend/CMakeLists.txt index 80fd16e2ed6483..f07b6cf999fea8 100644 --- a/src/frontends/onnx/frontend/CMakeLists.txt +++ b/src/frontends/onnx/frontend/CMakeLists.txt @@ -77,7 +77,7 @@ ov_add_frontend(NAME onnx FILEDESCRIPTION "FrontEnd to load and convert ONNX file format" LINK_LIBRARIES openvino_onnx_common openvino::core::dev) -set(ONNX_OPSET_VERSION 20 CACHE INTERNAL "Supported version of ONNX operator set") +set(ONNX_OPSET_VERSION 21 CACHE INTERNAL "Supported version of ONNX operator set") target_compile_definitions(${TARGET_NAME} PRIVATE ONNX_OPSET_VERSION=${ONNX_OPSET_VERSION}) if(BUILD_SHARED_LIBS) diff --git a/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp b/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp index fe2ea3106e31ee..b09bc73467bc10 100644 --- a/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp +++ b/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp @@ -11,10 +11,15 @@ #include "openvino/frontend/exception.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/convert.hpp" +#include "openvino/op/convert_like.hpp" #include "openvino/op/multiply.hpp" #include "openvino/op/reshape.hpp" +#include "openvino/op/shape_of.hpp" #include "openvino/op/subtract.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/unsqueeze.hpp" #include "utils/common.hpp" +#include "utils/reshape.hpp" using namespace ov::op; namespace ov { @@ -188,8 +193,78 @@ ov::OutputVector dequantize_linear(const ov::frontend::onnx::Node& node) { // these reshapes make sure that dequantization happens over the specified axis return detail::dequantize_linear(x, scale, zero_point, node.get_attribute_value("axis", 1), node); } -ONNX_OP("DequantizeLinear", OPSET_SINCE(13), ai_onnx::opset_13::dequantize_linear); +ONNX_OP("DequantizeLinear", {13, 18}, ai_onnx::opset_13::dequantize_linear); } // namespace opset_13 + +namespace opset_19 { +ONNX_OP("DequantizeLinear", {19, 20}, ai_onnx::opset_13::dequantize_linear); +} // namespace opset_19 + +namespace opset_21 { +ov::OutputVector dequantize_linear(const ov::frontend::onnx::Node& node) { + common::default_op_checks(node, 2); + + const ov::OutputVector inputs{node.get_ov_inputs()}; + const auto& src_x = inputs[0]; + ov::Output scale = inputs[1]; + const auto& scale_shape = scale.get_partial_shape(); + ov::Output zp; + + // 
When no blocking dequantization is required - use regular DequantizeLinear + if (scale_shape.rank().is_static() && scale_shape.rank().get_length() <= 1) { + return ai_onnx::opset_13::dequantize_linear(node); + } + + FRONT_END_GENERAL_CHECK(scale_shape.rank().is_static(), "Rank of the input data tensor has to be known (static)."); + FRONT_END_GENERAL_CHECK(scale_shape.rank().get_length() == 2, + "DequantizeLinear cannot operate with more than 2D scales"); + FRONT_END_GENERAL_CHECK(src_x.get_partial_shape().is_static(), + "DequantizeLinear cannot operate with dynamic shapes of input X"); + + const auto& unsqueezed_axes = std::make_shared(ov::element::i64, Shape{1}, std::vector{1}); + + if (inputs.size() > 2) { + zp = inputs[2]; + if (zp.get_element_type() != scale.get_element_type()) { + zp = std::make_shared(zp, scale); + } + zp = std::make_shared(zp, unsqueezed_axes); + } + + const auto axis = node.get_attribute_value("axis", 1); + const auto block_size = static_cast(node.get_attribute_value("block_size", 0)); + const auto scale_type = scale.get_element_type(); + + FRONT_END_GENERAL_CHECK(axis == 0, "Axis != 0 isn't supported"); + FRONT_END_GENERAL_CHECK(block_size > 0, "block_size must be greater than zero"); + FRONT_END_GENERAL_CHECK( + src_x.get_shape()[0] % block_size == 0, + "DequantizeLinear doesn't support case when first dimension of X cannot be divided by block_size"); + + const auto& x = src_x.get_element_type() == scale_type ? src_x : std::make_shared(src_x, scale); + // For further broadcasting scales and zp - reshape input to a shape [x.shape[0]/block_size, block_size, x.shape[1]] + ov::Output broadcastable_x = + op::util::reshape(x, Shape{static_cast(x.get_shape()[0]) / block_size, block_size, x.get_shape()[1]}); + + // Adding additional dimension for broadcasting + scale = std::make_shared(scale, unsqueezed_axes); + + if (zp.get_node_shared_ptr()) { + broadcastable_x = std::make_shared(broadcastable_x, zp); + } + + const auto& scaled_x = std::make_shared(broadcastable_x, scale); + + // Returning back a shape + const auto& reshaped_scaled_x = + std::make_shared(scaled_x, std::make_shared(src_x), false); + + reshaped_scaled_x->set_friendly_name(node.get_name()); + + return {reshaped_scaled_x}; +} +ONNX_OP("DequantizeLinear", OPSET_SINCE(21), ai_onnx::opset_21::dequantize_linear); +} // namespace opset_21 } // namespace ai_onnx } // namespace onnx } // namespace frontend diff --git a/src/frontends/onnx/tests/models/dequantize_linear_21.prototxt b/src/frontends/onnx/tests/models/dequantize_linear_21.prototxt new file mode 100644 index 00000000000000..0378ad13ce0ce9 --- /dev/null +++ b/src/frontends/onnx/tests/models/dequantize_linear_21.prototxt @@ -0,0 +1,63 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + name: "test_dequantize_21" + initializer { + dims: 6 + dims: 3 + data_type: 21 + name: "data" + raw_data: "\x99\x99\x99\x99\x99\x99\x99\x99\x99" + } + initializer { + dims: 2 + dims: 3 + data_type: 1 + name: "scale" + raw_data: "\x00\x00\x80\x3f\x00\x00\x80\x3f\x00\x00\x80\x3f\x00\x00\x80\x3f\x00\x00\x80\x3f\x00\x00\x80\x3f" + } + initializer { + dims: 2 + dims: 3 + data_type: 21 + name: "zp" + raw_data: "\x78\x56\x34" + } + node { + input: "data" + input: "scale" + input: "zp" + output: "output" + name: "DequantizeNode" + op_type: "DequantizeLinear" + attribute { + name: "axis" + i: 0 + type: INT + } + attribute { + name: "block_size" + i: 3 + type: INT + } + } + output { + name: "output" + type { + tensor_type { + elem_type: 1 + shape { + dim { + 
dim_value: 6 + } + dim { + dim_value: 3 + } + } + } + } + } +} +opset_import { + version: 21 +} diff --git a/src/frontends/onnx/tests/onnx_import_quant.in.cpp b/src/frontends/onnx/tests/onnx_import_quant.in.cpp index c2d48c292cb8c1..ad85ef98ede8d9 100644 --- a/src/frontends/onnx/tests/onnx_import_quant.in.cpp +++ b/src/frontends/onnx/tests/onnx_import_quant.in.cpp @@ -317,6 +317,16 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_model_dequantize_linear_1d_zero_scale_uint8_ test_case.run(); } +OPENVINO_TEST(${BACKEND_NAME}, onnx_model_dequantize_linear_opset21) { + auto model = convert_model("dequantize_linear_21.onnx"); + + auto test_case = ov::test::TestCase(model, s_device); + + test_case.add_expected_output({6, 3}, + std::vector{1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6, 4, 5, 6}); + test_case.run(); +} + OPENVINO_TEST(${BACKEND_NAME}, onnx_model_dequantize_linear_scalar_ignore_axis) { auto model = convert_model("dequantize_linear_scalar_ignore_axis.onnx"); diff --git a/src/inference/src/cache_manager.hpp b/src/inference/src/cache_manager.hpp index 9e9ebd3ddcbc2b..c441811c3cfd02 100644 --- a/src/inference/src/cache_manager.hpp +++ b/src/inference/src/cache_manager.hpp @@ -14,6 +14,7 @@ #include #include +#include "openvino/runtime/shared_buffer.hpp" #include "openvino/util/file_util.hpp" #include "openvino/util/mmap_object.hpp" @@ -77,9 +78,10 @@ class ICacheManager { * Otherwise, model will not be read from cache and will be loaded as usual * * @param id Id of cache (hash of the model) + * @param enable_mmap use mmap or ifstream to read model file * @param reader Lambda function to be called when input stream is created */ - virtual void read_cache_entry(const std::string& id, StreamReader reader, bool mmap = false) = 0; + virtual void read_cache_entry(const std::string& id, bool enable_mmap, StreamReader reader) = 0; /** * @brief Callback when OpenVINO intends to remove cache entry @@ -130,13 +132,17 @@ class FileStorageCacheManager final : public ICacheManager { writer(stream); } - void read_cache_entry(const std::string& id, StreamReader reader, bool mmap = false) override { + void read_cache_entry(const std::string& id, bool enable_mmap, StreamReader reader) override { // Fix the bug caused by pugixml, which may return unexpected results if the locale is different from "C". 
ScopedLocale plocal_C(LC_ALL, "C"); auto blob_file_name = getBlobFile(id); if (ov::util::file_exists(blob_file_name)) { - if (mmap) { - MmapStream stream(blob_file_name); + if (enable_mmap) { + auto mmap = ov::load_mmap_object(blob_file_name); + auto shared_buffer = + std::make_shared>>(mmap->data(), mmap->size(), mmap); + OwningSharedStreamBuffer buf(shared_buffer); + std::istream stream(&buf); reader(stream); } else { std::ifstream stream(blob_file_name, std::ios_base::binary); diff --git a/src/inference/src/dev/core_impl.cpp b/src/inference/src/dev/core_impl.cpp index 9f55dc53ccd24f..32b43f346e9e44 100644 --- a/src/inference/src/dev/core_impl.cpp +++ b/src/inference/src/dev/core_impl.cpp @@ -1397,19 +1397,12 @@ ov::SoPtr ov::CoreImpl::compile_model_and_cache(ov::Plugin& return compiled_model; } -static bool does_plugin_support_model_caching_with_mmap(const ov::Plugin& plugin) { - bool supported = plugin.supports_model_caching(); - supported &= - ov::util::contains(plugin.get_property(ov::internal::supported_properties), ov::internal::caching_with_mmap); - return supported; -} - ov::SoPtr ov::CoreImpl::load_model_from_cache( const CacheContent& cacheContent, ov::Plugin& plugin, const ov::AnyMap& config, const ov::SoPtr& context, - std::function()> compile_model_lambda) { + std::function()> compile_model_lambda) const { ov::SoPtr compiled_model; struct HeaderException {}; @@ -1418,6 +1411,8 @@ ov::SoPtr ov::CoreImpl::load_model_from_cache( try { cacheContent.cacheManager->read_cache_entry( cacheContent.blobId, + coreConfig.get_enable_mmap() && ov::util::contains(plugin.get_property(ov::internal::supported_properties), + ov::internal::caching_with_mmap), [&](std::istream& networkStream) { OV_ITT_SCOPE(FIRST_INFERENCE, ov::itt::domains::LoadTime, @@ -1454,8 +1449,7 @@ ov::SoPtr ov::CoreImpl::load_model_from_cache( update_config[ov::loaded_from_cache.name()] = true; compiled_model = context ? 
plugin.import_model(networkStream, context, update_config) : plugin.import_model(networkStream, update_config); - }, - does_plugin_support_model_caching_with_mmap(plugin)); + }); } catch (const HeaderException&) { // For these exceptions just remove old cache and set that import didn't work cacheContent.cacheManager->remove_cache_entry(cacheContent.blobId); diff --git a/src/inference/src/dev/core_impl.hpp b/src/inference/src/dev/core_impl.hpp index 79b1b96d57ac30..7cf12f3ba3280c 100644 --- a/src/inference/src/dev/core_impl.hpp +++ b/src/inference/src/dev/core_impl.hpp @@ -149,12 +149,12 @@ class CoreImpl : public ov::ICore, public std::enable_shared_from_this& context, const CacheContent& cacheContent) const; - static ov::SoPtr load_model_from_cache( + ov::SoPtr load_model_from_cache( const CacheContent& cacheContent, ov::Plugin& plugin, const ov::AnyMap& config, const ov::SoPtr& context, - std::function()> compile_model_lambda); + std::function()> compile_model_lambda) const; bool device_supports_model_caching(const ov::Plugin& plugin) const; diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp index a4c99e2cc1fca7..d6208e0a43bbe1 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp @@ -1982,6 +1982,51 @@ std::set> jit_soft_sign_emitter::get_supported_precis return {{element::f32}}; } +/// SQUARE_ROOT /// +jit_sqrt_emitter::jit_sqrt_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + prepare_table(); + } + +jit_sqrt_emitter::jit_sqrt_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { + prepare_table(); + } + +size_t jit_sqrt_emitter::get_inputs_count() const { + return 1; +} + +void jit_sqrt_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel"); + } +} + +template +void jit_sqrt_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); + + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + TReg src = TReg(in_vec_idxs[0]); + TReg dst = TReg(out_vec_idxs[0]); + + h->fsqrt(dst.s, src.s); +} + +std::set> jit_sqrt_emitter::get_supported_precisions( + const std::shared_ptr& node) { + return {{element::f32}}; +} + /// SUBTRACT /// jit_subtract_emitter::jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp index ccd82bc5b628e7..afecd3029f58db 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp @@ -800,14 +800,34 @@ class jit_soft_sign_emitter : public jit_emitter { static 
std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: - std::unique_ptr exp_emitter; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; }; +class jit_sqrt_emitter : public jit_emitter { +public: + jit_sqrt_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); + + jit_sqrt_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); + + size_t get_inputs_count() const override; + + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); + +private: + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; + + template + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; +}; + class jit_subtract_emitter : public jit_emitter { public: jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index a17b8d28e17f5d..dc0f953efe70ab 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -194,8 +194,8 @@ void Graph::Replicate(const std::shared_ptr &model, const auto port = unusedOutput.get_index(); const auto nodeName = std::string("stub_") + std::to_string(unusedOutput.get_index()) + "_" + parentNode->getName(); const NodePtr outNode = std::make_shared(parentNode->outputShapes[port], - parentNode->getOriginalOutputPrecisionAtPort(port), - nodeName, "Result", m_context); + parentNode->getOriginalOutputPrecisionAtPort(port), + nodeName, "Result", m_context); CreateEdge(parentNode, outNode, port, 0); AddNode(outNode); } diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index 34e48dea50cbfa..7c23d55fc4147a 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -655,7 +655,7 @@ std::vector Node::getChildEdgesAtPort(int inputNum) const { if (!edge) OPENVINO_THROW("Node ", getName(), " contains dead weak ptr"); if (edge->getInputNum() == inputNum) - res.push_back(edge); + res.emplace_back(std::move(edge)); } return res; } @@ -793,11 +793,10 @@ void Node::redefineOutputMemory(const std::vector &newOutputShapes) void Node::redefineOutputMemory(const size_t port, const VectorDims& new_output_shape) { const auto edges = getChildEdgesAtPort(port); + static const VectorDims single_element_shape = {1}; + // avoid 0D shape incompatible - auto new_shape = new_output_shape; - if (new_shape.empty()) { - new_shape.push_back(1); - } + const auto& new_shape = new_output_shape.empty() ? 
single_element_shape : new_output_shape; const auto& curr_desc = edges[0]->getMemory().getDesc(); if (curr_desc.getShape().isStatic() && curr_desc.getShape().getStaticDims() == new_shape) { diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp index 7848e479f175e4..586e7f0705643f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp @@ -48,6 +48,7 @@ bool JitEltwiseExecutor::isSupported( Algorithm::EltwiseSelect, Algorithm::EltwiseSigmoid, Algorithm::EltwiseSoftSign, + Algorithm::EltwiseSqrt, Algorithm::EltwiseSubtract, Algorithm::EltwiseSwish, Algorithm::EltwiseTanh); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp index 59a5f812499481..98eb279bb26d48 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp @@ -670,6 +670,7 @@ std::shared_ptr jit_uni_eltwise_generic::create_eltwise_emitte OV_CASE(Algorithm::EltwiseSelect, ov::intel_cpu::aarch64::jit_select_emitter), OV_CASE(Algorithm::EltwiseSigmoid, ov::intel_cpu::aarch64::jit_sigmoid_emitter), OV_CASE(Algorithm::EltwiseSoftSign, ov::intel_cpu::aarch64::jit_soft_sign_emitter), + OV_CASE(Algorithm::EltwiseSqrt, ov::intel_cpu::aarch64::jit_sqrt_emitter), OV_CASE(Algorithm::EltwiseSubtract, ov::intel_cpu::aarch64::jit_subtract_emitter), OV_CASE(Algorithm::EltwiseSwish, ov::intel_cpu::aarch64::jit_swish_emitter), OV_CASE(Algorithm::EltwiseTanh, ov::intel_cpu::aarch64::jit_tanh_emitter)); @@ -847,6 +848,7 @@ std::set> eltwise_precision_helper::get_supported_pre OV_CASE(Algorithm::EltwiseSelect, jit_select_emitter), OV_CASE(Algorithm::EltwiseSigmoid, jit_sigmoid_emitter), OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), + OV_CASE(Algorithm::EltwiseSqrt, jit_sqrt_emitter), OV_CASE(Algorithm::EltwiseSubtract, jit_subtract_emitter), OV_CASE(Algorithm::EltwiseSwish, jit_swish_emitter), OV_CASE(Algorithm::EltwiseTanh, jit_tanh_emitter)); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp index a9998e88402ca7..d95f973fa9f2f0 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp @@ -180,7 +180,6 @@ static void attn_quant_mt(const ov::intel_cpu::PlainTensor& k_src, // For compatibility, all input_kvs are permuted to BHLS size_t B = k_src.m_dims[0], H = k_src.m_dims[1], L1 = k_src.m_dims[2], S = k_src.m_dims[3]; // Internal LBHS layout has strides[L] > strides[B] - assert(k_src.m_strides[2] > k_src.m_strides[0]); parallel_for3d(L1, B, H, [&](size_t m, size_t b, size_t h) { auto p_k = k_scale_zp.ptr(m, b, h); auto p_v = v_scale_zp.ptr(m, b, h); @@ -238,6 +237,8 @@ void attn_quantkv(const ov::intel_cpu::PlainTensor& k_src, attn_quant_mt(k_src, v_src, k_dst, v_dst, k_scale_zp, v_scale_zp); } else if (k_src.get_precision() == ov::element::bf16 && k_dst.get_precision() == ov::element::u8) { attn_quant_mt(k_src, v_src, k_dst, v_dst, k_scale_zp, v_scale_zp); + } else if (k_src.get_precision() == ov::element::f16 && k_dst.get_precision() == ov::element::u8) { + attn_quant_mt(k_src, v_src, k_dst, v_dst, k_scale_zp, v_scale_zp); } else { 
OPENVINO_THROW("unsupport src type: ", k_src.get_precision(), ", dst type: ", k_dst.get_precision(), " in attn_quantkv"); } @@ -252,6 +253,8 @@ void paged_attn_quantkv(const ov::intel_cpu::PlainTensor& k_src, paged_attn_quant_mt(k_src, v_src, k_dst, v_dst, slot_mapping); } else if (k_src.get_precision() == ov::element::bf16 && k_dst.get_precision() == ov::element::u8) { paged_attn_quant_mt(k_src, v_src, k_dst, v_dst, slot_mapping); + } else if (k_src.get_precision() == ov::element::f16 && k_dst.get_precision() == ov::element::u8) { + paged_attn_quant_mt(k_src, v_src, k_dst, v_dst, slot_mapping); } else { OPENVINO_THROW("unsupport src type: ", k_src.get_precision(), ", dst type: ", k_dst.get_precision(), " in paged_attn_quantkv"); } diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp index 3341f6f6082d99..2956c8a6a6b5b8 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp @@ -37,15 +37,22 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); return _mm512_castsi512_ps(_mm512_slli_epi32(y, 16)); } + // load addr to __m512 reg + inline __m512 mm512_uni_loadu_ps(const float* a) { + return _mm512_loadu_ps(a); + } + inline __m512 mm512_uni_loadu_ps(const ov::bfloat16* a) { auto vec_bf16 = _mm256_loadu_si256(reinterpret_cast(a)); return cvt_bf16_to_fp32(vec_bf16); } - inline __m512 mm512_uni_loadu_ps(const float* a) { - return _mm512_loadu_ps(a); + inline __m512 mm512_uni_loadu_ps(const ov::float16* a) { + auto vec_f16 = _mm256_loadu_si256(reinterpret_cast(a)); + return _mm512_cvtph_ps(vec_f16); } + // load addr to __m512 reg inline __m512 mm512_uni_loadu_tail_ps(const float* a, size_t count) { __mmask16 mask = (1 << count) - 1; return _mm512_maskz_loadu_ps(mask, a); @@ -57,6 +64,13 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); return cvt_bf16_to_fp32(bf16_vec); } + inline __m512 mm512_uni_loadu_tail_ps(const ov::float16* a, size_t count) { + auto mask = (1 << count) - 1; + auto f16_vec = _mm256_maskz_loadu_epi16(mask, a); + return _mm512_cvtph_ps(f16_vec); + } + + // store __m512 reg to addr inline void mm512_uni_storeu_ps(float* a, __m512 v) { _mm512_storeu_ps(a, v); } @@ -72,6 +86,13 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); x = _mm512_mask_blend_epi32(mask, nan, x); // Check NaN before converting back to bf16 _mm256_storeu_si256(reinterpret_cast<__m256i *>(addr), _mm512_cvtepi32_epi16(x)); } + + inline void mm512_uni_storeu_ps(ov::float16* addr, __m512 v) { + __m256i vec_f16 = _mm512_cvtps_ph(v, 0); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(addr), vec_f16); + } + + // store __m512 reg to addr inline void mm512_uni_mask_storeu_ps(ov::bfloat16 *addr, __mmask16 mask_addr, __m512 xps) { __m512i xpi32 = _mm512_castps_si512(xps); __m512i nan = _mm512_set1_epi32(0xffff); @@ -85,18 +106,29 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); _mm512_mask_cvtepi32_storeu_epi16(addr, mask_addr, x); } - inline __m512 mm512_uni_loadu_ps(ov::float16* a) { - auto vec_f16 = _mm256_loadu_si256(reinterpret_cast(a)); - return _mm512_cvtph_ps(vec_f16); + inline void mm512_uni_storeu_tail_ps(float *addr, __m512 v, size_t count) { + __mmask16 mask_addr = (1 << count) - 1; + _mm512_mask_storeu_ps(addr, mask_addr, v); } - inline __m512 mm512_uni_loadu_tail_ps(const ov::float16* a, size_t count) { - 
auto mask = (1 << count) - 1; - auto f16_vec = _mm256_maskz_loadu_epi16(mask, a); - return _mm512_cvtph_ps(f16_vec); + + inline void mm512_uni_storeu_tail_ps(ov::bfloat16 *addr, __m512 v, size_t count) { + __mmask16 mask_addr = (1 << count) - 1; + __m512i xpi32 = _mm512_castps_si512(v); + __m512i nan = _mm512_set1_epi32(0xffff); + auto mask = _mm512_cmp_ps_mask(v, v, _CMP_ORD_Q); + __m512i ones = _mm512_set1_epi32(0x1); + __m512i vec_bias = _mm512_set1_epi32(0x7fff); + auto x = _mm512_and_si512(_mm512_srli_epi32(xpi32, 16), ones); // LSB = x[16] + x = _mm512_add_epi32(x, vec_bias); // rounding_bias = 0x7fff + LSB + x = _mm512_srli_epi32(_mm512_add_epi32(x, xpi32), 16); // x = (x + rounding_bias) >> 16; + x = _mm512_mask_blend_epi32(mask, nan, x); // Check NaN before converting back to bf16 + _mm512_mask_cvtepi32_storeu_epi16(addr, mask_addr, x); } - inline void mm512_uni_storeu_ps(ov::float16* addr, __m512 v) { + + inline void mm512_uni_storeu_tail_ps(ov::float16 *addr, __m512 v, size_t count) { + __mmask16 mask_addr = (1 << count) - 1; __m256i vec_f16 = _mm512_cvtps_ph(v, 0); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(addr), vec_f16); + _mm256_mask_storeu_epi16(reinterpret_cast<__m256i *>(addr), mask_addr, vec_f16); } #endif @@ -115,12 +147,11 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); }; return _mm256_loadu_si256(&mask[N7]); } + + // load addr to __m256 reg inline __m256 mm256_uni_loadu_ps(const float* a) { return _mm256_loadu_ps(a); } - inline void mm256_uni_storeu_ps(float* a, __m256 v) { - _mm256_storeu_ps(a, v); - } inline __m256 mm256_uni_loadu_ps(const ov::bfloat16* a) { auto vec_bf16 = _mm_loadu_si128(reinterpret_cast(a)); @@ -128,6 +159,13 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); return o; } + inline __m256 mm256_uni_loadu_ps(const ov::float16* a) { + auto vec_f16 = _mm_loadu_si128(reinterpret_cast(a)); + auto o = _mm256_cvtph_ps(vec_f16); + return o; + } + + // load addr tail to __m256 reg inline __m256 mm256_uni_loadu_tail_ps(const float* a, const size_t count) { auto mask = get_mask(count); return _mm256_maskload_ps(a, mask); @@ -140,6 +178,17 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); return mm256_uni_loadu_ps(tmp_values); } + inline __m256 mm256_uni_loadu_tail_ps(const ov::float16* a, const size_t count) { + ov::float16 tmp_values[8] = {0}; + std::memcpy(tmp_values, a, count * sizeof(ov::float16)); + return mm256_uni_loadu_ps(tmp_values); + } + + // store __m256 reg to addr + inline void mm256_uni_storeu_ps(float* a, __m256 v) { + _mm256_storeu_ps(a, v); + } + inline void mm256_uni_storeu_ps(ov::bfloat16 *addr, __m256 xps) { __m256i xpi32 = _mm256_castps_si256(xps); __m256i nan = _mm256_set1_epi32(0xffff); @@ -156,21 +205,17 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); _mm_storeu_si128(reinterpret_cast<__m128i *>(addr), bf16_o); } - inline __m256 mm256_uni_loadu_ps(ov::float16* a) { - auto vec_f16 = _mm_loadu_si128(reinterpret_cast<__m128i*>(a)); - auto o = _mm256_cvtph_ps(vec_f16); - return o; - } - inline __m256 mm256_uni_loadu_tail_ps(const ov::float16* a, const size_t count) { - ov::float16 tmp_values[8] = {0}; - std::memcpy(tmp_values, a, count * sizeof(ov::float16)); - return mm256_uni_loadu_ps(tmp_values); - } inline void mm256_uni_storeu_ps(ov::float16* a, __m256 v) { __m128i vec_f16 = _mm256_cvtps_ph(v, 0); _mm_storeu_si128(reinterpret_cast<__m128i *>(a), vec_f16); } + // store __m256 to addr + inline void 
mm256_uni_storeu_tail_ps(float *addr, __m256 v, size_t count) { + const auto mask = get_mask(count); + return _mm256_maskstore_ps(addr, mask, v); + } + inline void hsum(__m256& x) { __m256 y; // x: 0 1 2 3 4 5 6 7 y = _mm256_permute_ps(x, 0x39); // y: 1 2 3 0 5 6 7 4 @@ -292,4 +337,4 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); } // namespace XARCH } // namespace Cpu } // namespace Extensions -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp index 1fe7b811b922a8..971aa6bb58c994 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp @@ -14,6 +14,7 @@ #endif #include "openvino/core/type/bfloat16.hpp" +#include "openvino/core/type/float16.hpp" #include "openvino/core/parallel.hpp" #include "executor_pa.hpp" #include "executor_pa_common.hpp" @@ -619,7 +620,8 @@ void transpose_16NxK(TDST* dst, TSRC* src, TDST* tmp, size_t N, size_t K, size_t } #if defined(HAVE_AVX512F) -static void transpose_16NxK(ov::bfloat16* dst, ov::bfloat16* src, ov::bfloat16* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { +template::value || std::is_same::value), bool>::type> +static void transpose_16NxK(T* dst, T* src, T* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // will treat as uint32_t transpose auto s = reinterpret_cast(src); auto d = reinterpret_cast(dst); @@ -669,8 +671,8 @@ void dequant(TDST* dst, uint8_t* src, size_t N, size_t K) { } #if defined(HAVE_AVX512F) -// pack bf16/u8 to bf16 -static void pack_32x32_kernel(ov::bfloat16* dst, ov::bfloat16* src, size_t dst_stride, size_t src_stride) { +template::value || std::is_same::value), bool>::type> +static void pack_32x32_kernel(T* dst, T* src, size_t dst_stride, size_t src_stride) { static const uint64_t idx[8] = {0, 4, 1, 5, 2, 6, 3, 7}; auto midx = _mm512_loadu_si512(idx); for (size_t i = 0; i < 16; i++) { @@ -687,7 +689,8 @@ static void pack_32x32_kernel(ov::bfloat16* dst, ov::bfloat16* src, size_t dst_s } } -static void pack_32x16_kernel(ov::bfloat16* dst, ov::bfloat16* src, size_t dst_stride, size_t src_stride) { +template::value || std::is_same::value), bool>::type> +static void pack_32x16_kernel(T* dst, T* src, size_t dst_stride, size_t src_stride) { static const uint64_t idx[8] = {0, 4, 1, 5, 2, 6, 3, 7}; auto midx = _mm512_loadu_si512(idx); for (size_t i = 0; i < 16; i++) { @@ -704,7 +707,8 @@ static void pack_32x16_kernel(ov::bfloat16* dst, ov::bfloat16* src, size_t dst_s } } -static void pack_32Nx16K(ov::bfloat16* dst, ov::bfloat16* src, ov::bfloat16* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { +template::value || std::is_same::value), bool>::type> +static void pack_32Nx16K(T* dst, T* src, T* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { for (size_t n = 0; n < N; n += 32) { size_t k = 0; for (; k + 32 <= K; k += 32) { @@ -718,7 +722,8 @@ static void pack_32Nx16K(ov::bfloat16* dst, ov::bfloat16* src, ov::bfloat16* tmp } } -static void pack_32Nx16K(ov::bfloat16* dst, uint8_t* src, ov::bfloat16* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { +template::value || std::is_same::value), bool>::type> +static void pack_32Nx16K(T* dst, uint8_t* src, T* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // The layout for per token per head: // 
|scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| // The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) @@ -730,7 +735,7 @@ static void pack_32Nx16K(ov::bfloat16* dst, uint8_t* src, ov::bfloat16* tmp, siz s += src_stride + 2 * sizeof(float); t += src_stride; } - pack_32Nx16K(dst, tmp, reinterpret_cast(0), N, K, dst_stride, src_stride); + pack_32Nx16K(dst, tmp, reinterpret_cast(0), N, K, dst_stride, src_stride); } #endif @@ -769,7 +774,7 @@ struct MHAHelper { std::vector> _wv_gemm_acc; // second token std::shared_ptr _gemv; - bool _fastpath_valid = false; + ov::element::Type _fastpath_valid_prec = ov::element::undefined; // second token for bhl loop PlainTensor _weight_bhl; PlainTensor _output_bhl; @@ -851,11 +856,20 @@ struct MHAHelper { _qk_scratch_a.resize({_nthr, _qk_gemm[_block_size - 1]->get_scratch_a_size() / sizeof(DATA_TYPE)}); _wv_scratch_a.resize({_nthr, _wv_gemm[_block_size - 1]->get_scratch_a_size() / sizeof(DATA_TYPE)}); - _fastpath_valid = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::amx_bf16) && - (S % 32 == 0) && (block_size % 16 == 0) && (S <= 32 * 6) && precision_of::value == ov::element::bf16; - // aligned to cache line (64bytes=16*sizeof(float)) to avoid false sharing - if (_fastpath_valid && !_gemv) - _gemv = std::make_shared(static_cast(S), static_cast(block_size)); + if ((S % 32 == 0) && (block_size % 16 == 0) && (S <= 32 * 6)) { + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::amx_bf16) && + precision_of::value == ov::element::bf16 && + precision_of::value == ov::element::bf16) { + _fastpath_valid_prec = ov::element::bf16; + } else if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::amx_fp16) && + precision_of::value == ov::element::f16 && + precision_of::value == ov::element::f16) { + _fastpath_valid_prec = ov::element::f16; + } + } + if (one_of(_fastpath_valid_prec, ov::element::bf16, ov::element::f16) && !_gemv) { + _gemv = std::make_shared(static_cast(S), static_cast(block_size), _fastpath_valid_prec); + } } if (init_alibi_lookup && (!_alibi_lookup || _alibi_lookup.m_dims[0] < kv_len)) { @@ -903,7 +917,7 @@ struct MHAHelper { auto q_start = q_blk * _block_size; auto q_end = std::min(q_start + _block_size, q_len); auto q_cnt = q_end - q_start; - constexpr bool q_is_bf16 = precision_of::value == ov::element::bf16; + constexpr bool q_is_xf16 = one_of(precision_of::value, ov::element::bf16, ov::element::f16); constexpr bool q_cache_is_same = precision_of::value == precision_of::value; auto cur_kv_len_blocks = div_up(cur_kv_len, _block_size); for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { @@ -978,12 +992,12 @@ struct MHAHelper { // reuse float buffer, need to use float to compute offset auto* w_ptr = reinterpret_cast(_weight.ptr(ithr, h, 0, 0)); - float* fp32_out_ptr = q_is_bf16 ? _output.ptr(ithr, 0, h, 0) : output_emb.ptr(q_start, h * _S); + float* fp32_out_ptr = q_is_xf16 ? _output.ptr(ithr, 0, h, 0) : output_emb.ptr(q_start, h * _S); // for each weight block, loop through all value block for (size_t v_blk = 0; v_blk < cur_kv_len_blocks; v_blk++) { DATA_TYPE* v_ptr; - if (q_is_bf16 || !q_cache_is_same) { + if (q_is_xf16 || !q_cache_is_same) { v_ptr = wv_scratch_b.ptr(v_blk, hk); } else { v_ptr = present_value.ptr(block_table[v_blk], hk); @@ -1004,11 +1018,11 @@ struct MHAHelper { _wv_scratch_a ? 
_wv_scratch_a.ptr(ithr, 0) : nullptr); } } - if (q_is_bf16) { + if (q_is_xf16) { attn_memcpy2d_kernel(_output.ptr(ithr, 0, h, 0), output_emb.ptr(q_start, h * _S), ov::element::f32, - ov::element::bf16, + precision_of::value, _output.stride(1), output_emb.stride(0), _S, @@ -1026,13 +1040,13 @@ struct MHAHelper { // output: [nthr, 32, H, S] void exec_kernel_one_bh(const PlainTensor& query, const PlainTensor& present_key, const PlainTensor& present_value, const PlainTensor& output_emb, const int32_t* block_table, size_t ithr, size_t hk, size_t q_len, size_t cur_kv_len, const PlainTensor& alibi_slopes, float* score_output) { - if (_fastpath_valid) { + if (one_of(_fastpath_valid_prec, ov::element::bf16, ov::element::f16)) { _gemv->tile_config(); for (size_t pk = 0, i = 0; pk < cur_kv_len; pk += _block_size, i++) { auto block_number = block_table[i]; for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { - (*_gemv)(query.ptr(h, pq), present_key.ptr(block_number, hk), + (*_gemv)(query.ptr(h, pq), present_key.ptr(block_number, hk), _weight.ptr(ithr, h, pq) + pk); } } @@ -1128,11 +1142,11 @@ struct MHAHelper { auto pk = pk_in_blocks * _block_size; if (pk < context_len) { auto block_number = block_indices.ptr()[block_indices_begins.ptr()[b] + pk_in_blocks]; - if (_fastpath_valid) { + if (one_of(_fastpath_valid_prec, ov::element::bf16, ov::element::f16)) { _gemv->tile_config(); for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { - (*_gemv)(query.ptr(b, h, pq), present_key.ptr(block_number, hk), + (*_gemv)(query.ptr(b, h, pq), present_key.ptr(block_number, hk), _weight_bhl.ptr(b, h, pq) + pk); } } @@ -1334,7 +1348,7 @@ struct MHA { const PlainTensor& alibi_slopes) { auto Hk = v_cache.m_dims[1]; - constexpr bool q_is_bf16 = precision_of::value == ov::element::bf16; + constexpr bool q_is_xf16 = one_of(precision_of::value, ov::element::bf16, ov::element::f16); constexpr bool q_cache_is_same = precision_of::value == precision_of::value; auto attn_work_count = _workitems.attn_work_size(); auto reorder_work_count = _workitems.reorder_work_size(); @@ -1360,7 +1374,7 @@ struct MHA { _helper._output.template ptr(ithr), _helper._block_size, _helper._S, _helper._block_size, _helper._S); - if (q_is_bf16) { + if (q_is_xf16) { pack_32Nx16K(_helper._wv_scratch_b.template ptr(batch_in_reorder, kv_block, hk), v_ptr, _helper._output.template ptr(ithr), @@ -1604,6 +1618,17 @@ std::shared_ptr make_pa_executor(ov::element::Type data_ } #else OPENVINO_THROW("make_pa_executor: bf16 needs avx512+ hardware."); +#endif + } else if (data_type == ov::element::f16) { +#if defined(HAVE_AVX512F) + if (kvcache_type == ov::element::u8) { + executor = std::make_shared>(); + } else { + OPENVINO_ASSERT(kvcache_type == ov::element::f16, "expect kvcache type f16, current: ", kvcache_type); + executor = std::make_shared>(); + } +#else + OPENVINO_THROW("make_pa_executor: f16 needs avx512+ hardware."); #endif } else if (data_type == ov::element::f32) { if (kvcache_type == ov::element::u8) { diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.cpp index 63a8a0f7d24062..70723a577b0c2b 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.cpp @@ -10,6 +10,7 @@ #include #include 
"openvino/core/type/bfloat16.hpp" +#include "openvino/core/type/float16.hpp" #include "openvino/core/parallel.hpp" #include "executor_pa_common.hpp" #include "utils/plain_tensor.hpp" @@ -57,7 +58,8 @@ void TileConfiger::generate() { ret(); } -JitMatMulVecAMX::JitMatMulVecAMX(int head_size, int block_size) : jit_generator(jit_name()), m_head_size(head_size), m_block_size(block_size) { +JitMatMulVecAMX::JitMatMulVecAMX(int head_size, int block_size, ov::element::Type amx_prec) : + jit_generator(jit_name()), m_head_size(head_size), m_block_size(block_size), m_amx_prec(amx_prec) { create_kernel(); m_tile_cfg.reset(1, 0, @@ -98,7 +100,11 @@ void JitMatMulVecAMX::generate() { tilezero(tmmC); for (int i = 0; i < num_B_tiles; i++) { tileloadd(tmmA, ptr[reg_k_addr + reg_stride_A + i * 64]); - tdpbf16ps(tmmC, tmmA, Xbyak::Tmm(tmmB0.getIdx() + i)); + if (m_amx_prec == ov::element::bf16) { + tdpbf16ps(tmmC, tmmA, Xbyak::Tmm(tmmB0.getIdx() + i)); + } else if (m_amx_prec == ov::element::f16) { + tdpfp16ps(tmmC, tmmA, Xbyak::Tmm(tmmB0.getIdx() + i)); + } } tilestored(ptr[reg_dst_addr + reg_stride_BC + m * sizeof(float)], tmmC); add(reg_k_addr, m_head_size * 2 * 16); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp index 237860ec692e76..bc21457a3285b4 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp @@ -69,9 +69,10 @@ class JitMatMulVecAMX : public dnnl::impl::cpu::x64::jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(JitMatMulVecAMX) int m_head_size; int m_block_size; + ov::element::Type m_amx_prec; TileConfiger m_tile_configer; TileConfig m_tile_cfg; - JitMatMulVecAMX(int head_size, int block_size); + JitMatMulVecAMX(int head_size, int block_size, ov::element::Type amx_prec); void tile_config() { m_tile_configer(&m_tile_cfg); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp index 3ce275d47e3d9d..0670c744a6da91 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp @@ -841,20 +841,20 @@ static void attn_reduce(ov::float16* dst, ov::float16* temp, size_t M, size_t S, template static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, - const ov::intel_cpu::PlainTensor& present_key, - const ov::intel_cpu::PlainTensor& present_value, - const ov::intel_cpu::PlainTensor& alibi_mask, - const ov::intel_cpu::PlainTensor& attention_mask, - const ov::intel_cpu::PlainTensor& beams, - ov::intel_cpu::PlainTensor& output_emb, - ov::intel_cpu::PlainTensor& buf_attn_w, - ov::intel_cpu::PlainTensor& buf_attn_score, - bool has_out_transpose, - bool auto_causal, - float d_scale, - const ov::intel_cpu::PlainTensor& past_k_scale_zp, - const ov::intel_cpu::PlainTensor& past_v_scale_zp, - ov::intel_cpu::PlainTensor& head_sum) { + const ov::intel_cpu::PlainTensor& present_key, + const ov::intel_cpu::PlainTensor& present_value, + const ov::intel_cpu::PlainTensor& alibi_mask, + const ov::intel_cpu::PlainTensor& attention_mask, + const ov::intel_cpu::PlainTensor& beams, + ov::intel_cpu::PlainTensor& output_emb, + ov::intel_cpu::PlainTensor& buf_attn_w, + ov::intel_cpu::PlainTensor& buf_attn_score, + bool has_out_transpose, + bool auto_causal, + float d_scale, + const 
ov::intel_cpu::PlainTensor& past_k_scale_zp, + const ov::intel_cpu::PlainTensor& past_v_scale_zp, + ov::intel_cpu::PlainTensor& head_sum) { ov::intel_cpu::PlainTensor causal_mask; bool select_nfltmax_at_0 = false; auto B = query.size(0); @@ -976,16 +976,16 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, attn_mask_ptr = reinterpret_cast(&attention_mask.at({b, h, pq, 0}, true)); uint8_t* cmask_ptr = causal_mask ? &causal_mask.at({b, h, pq, 0}, true) : nullptr; attn_softmax_kernel(buf_attn_w.ptr(b, h, pq), - buf_attn_w.ptr(b, h, pq), - d_scale, - alibi_ptr, - attn_mask_ptr, - cmask_ptr, - select_nfltmax_at_0, - ncausal, - cur_kv_len, - attn_mask_prec, - precision); + buf_attn_w.ptr(b, h, pq), + d_scale, + alibi_ptr, + attn_mask_ptr, + cmask_ptr, + select_nfltmax_at_0, + ncausal, + cur_kv_len, + attn_mask_prec, + precision); }); // attn_w * V @@ -1054,11 +1054,11 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = h_group * h_each_group_len; h < (h_group + 1) * h_each_group_len; h++) { attn_acc_value(buf_attn_score.ptr(ithr, b, pq, h), - buf_attn_w.ptr(b, h, pq)[pv], - v, - S, - p + 0, - p + 1); + buf_attn_w.ptr(b, h, pq)[pv], + v, + S, + p + 0, + p + 1); } } parallel_it_step(pv, kv_len, b, B, h_group, h_group_num); @@ -1093,86 +1093,36 @@ void mha_single_token(const ov::intel_cpu::PlainTensor& query, if (query.get_precision() == ov::element::bf16) { if (present_key.get_precision() == ov::element::u8) { mha_single_token_kernel(query, - present_key, - present_value, - alibi_mask, - attention_mask, - beams, - output_emb, - buf_attn_w, - buf_attn_score, - has_out_transpose, - auto_causal, - d_scale, - past_k_scale_zp, - past_v_scale_zp, - head_sum); + present_key, + present_value, + alibi_mask, + attention_mask, + beams, + output_emb, + buf_attn_w, + buf_attn_score, + has_out_transpose, + auto_causal, + d_scale, + past_k_scale_zp, + past_v_scale_zp, + head_sum); } else { mha_single_token_kernel(query, - present_key, - present_value, - alibi_mask, - attention_mask, - beams, - output_emb, - buf_attn_w, - buf_attn_score, - has_out_transpose, - auto_causal, - d_scale, - past_k_scale_zp, - past_v_scale_zp, - head_sum); - } - } else if (query.get_precision() == ov::element::f32) { - if (present_key.get_precision() == ov::element::u8) { - mha_single_token_kernel(query, - present_key, - present_value, - alibi_mask, - attention_mask, - beams, - output_emb, - buf_attn_w, - buf_attn_score, - has_out_transpose, - auto_causal, - d_scale, - past_k_scale_zp, - past_v_scale_zp, - head_sum); - } else if (present_key.get_precision() == ov::element::f16) { - mha_single_token_kernel(query, - present_key, - present_value, - alibi_mask, - attention_mask, - beams, - output_emb, - buf_attn_w, - buf_attn_score, - has_out_transpose, - auto_causal, - d_scale, - past_k_scale_zp, - past_v_scale_zp, - head_sum); - } else { - mha_single_token_kernel(query, - present_key, - present_value, - alibi_mask, - attention_mask, - beams, - output_emb, - buf_attn_w, - buf_attn_score, - has_out_transpose, - auto_causal, - d_scale, - past_k_scale_zp, - past_v_scale_zp, - head_sum); + present_key, + present_value, + alibi_mask, + attention_mask, + beams, + output_emb, + buf_attn_w, + buf_attn_score, + has_out_transpose, + auto_causal, + d_scale, + past_k_scale_zp, + past_v_scale_zp, + head_sum); } } else if (query.get_precision() == ov::element::f16) { #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) @@ -1196,8 +1146,90 @@ 
void mha_single_token(const ov::intel_cpu::PlainTensor& query, OPENVINO_THROW("Unsupported precision: ", query.get_precision()); } #else - OPENVINO_THROW("Unsupported precision: ", query.get_precision()); + if (present_key.get_precision() == ov::element::u8) { + mha_single_token_kernel(query, + present_key, + present_value, + alibi_mask, + attention_mask, + beams, + output_emb, + buf_attn_w, + buf_attn_score, + has_out_transpose, + auto_causal, + d_scale, + past_k_scale_zp, + past_v_scale_zp, + head_sum); + } else { + mha_single_token_kernel(query, + present_key, + present_value, + alibi_mask, + attention_mask, + beams, + output_emb, + buf_attn_w, + buf_attn_score, + has_out_transpose, + auto_causal, + d_scale, + past_k_scale_zp, + past_v_scale_zp, + head_sum); + } #endif + } else if (query.get_precision() == ov::element::f32) { + if (present_key.get_precision() == ov::element::u8) { + mha_single_token_kernel(query, + present_key, + present_value, + alibi_mask, + attention_mask, + beams, + output_emb, + buf_attn_w, + buf_attn_score, + has_out_transpose, + auto_causal, + d_scale, + past_k_scale_zp, + past_v_scale_zp, + head_sum); + } else if (present_key.get_precision() == ov::element::f16) { + mha_single_token_kernel(query, + present_key, + present_value, + alibi_mask, + attention_mask, + beams, + output_emb, + buf_attn_w, + buf_attn_score, + has_out_transpose, + auto_causal, + d_scale, + past_k_scale_zp, + past_v_scale_zp, + head_sum); + } else { + mha_single_token_kernel(query, + present_key, + present_value, + alibi_mask, + attention_mask, + beams, + output_emb, + buf_attn_w, + buf_attn_score, + has_out_transpose, + auto_causal, + d_scale, + past_k_scale_zp, + past_v_scale_zp, + head_sum); + } } else { OPENVINO_THROW("Unsupported precision: ", query.get_precision()); } @@ -1205,4 +1237,4 @@ void mha_single_token(const ov::intel_cpu::PlainTensor& query, } // namespace XARCH } // namespace Cpu } // namespace Extensions -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp index bffe0ee3761dd5..60c6a24ec5f2fa 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp @@ -751,14 +751,14 @@ inline void multiply_scalar(float* a, float* a_dst, const float val, const size_ while (i + vec_len_f32_avx512 <= size) { v_a = _mm512_loadu_ps(a + i); v_a = _mm512_mul_ps(v_a, v_scale); - _mm512_storeu_ps(a_dst + i, v_a); + mm512_uni_storeu_ps(a_dst + i, v_a); i += vec_len_f32_avx512; } if (i < size) { __mmask16 mask = (1 << (size - i)) - 1; v_a = _mm512_maskz_loadu_ps(mask, a + i); v_a = _mm512_mul_ps(v_a, v_scale); - _mm512_mask_storeu_ps(a_dst + i, mask, v_a); + mm512_uni_storeu_tail_ps(a_dst + i, v_a, size - i); i += (size - i); } @@ -768,14 +768,14 @@ inline void multiply_scalar(float* a, float* a_dst, const float val, const size_ while (i + vec_len_f32_avx2 <= size) { v_a = _mm256_loadu_ps(a + i); v_a = _mm256_mul_ps(v_a, v_scale); - _mm256_storeu_ps(a_dst + i, v_a); + mm256_uni_storeu_ps(a_dst + i, v_a); i += vec_len_f32_avx2; } if (i < size) { auto mask = get_mask(size - i); v_a = _mm256_maskload_ps(a + i, mask); v_a = _mm256_mul_ps(v_a, v_scale); - _mm256_maskstore_ps(a_dst + i, mask, v_a); + mm256_uni_storeu_tail_ps(a_dst + i, v_a, size - i); i += (size - i); } @@ -793,11 +793,12 @@ inline void multiply_scalar(float* a, float* 
a_dst, const float val, const size_ } } -inline void multiply_scalar(float* a, ov::bfloat16* a_dst, const float val, const size_t size) { +template::value || std::is_same::value), bool>::type> +inline void multiply_scalar(float* a, T* a_dst, const float val, const size_t size) { + size_t i = 0; #if defined(HAVE_AVX512F) auto v_scale = _mm512_set1_ps(val); __m512 v_a = {0}; - size_t i = 0; while (i + vec_len_f32_avx512 <= size) { v_a = _mm512_loadu_ps(a + i); v_a = _mm512_mul_ps(v_a, v_scale); @@ -808,10 +809,12 @@ inline void multiply_scalar(float* a, ov::bfloat16* a_dst, const float val, cons __mmask16 mask = (1 << (size - i)) - 1; v_a = _mm512_maskz_loadu_ps(mask, a + i); v_a = _mm512_mul_ps(v_a, v_scale); - mm512_uni_mask_storeu_ps(a_dst + i, mask, v_a); + mm512_uni_storeu_tail_ps(a_dst + i, v_a, size - i); + + i += (size - i); } #else - for (size_t i = 0; i < size; i++) { + for (; i < size; i++) { a_dst[i] = a[i] * val; } #endif @@ -898,6 +901,7 @@ inline void attn_softmax_kernel(float* a, float alibi_slope) { using func_fp32_type = void (*)(float*, float, const float*, const float*, const uint8_t*, bool, size_t, float, float&); using func_bf16_type = void (*)(float*, float, const float*, const ov::bfloat16*, const uint8_t*, bool, size_t, float, float&); + using func_f16_type = void (*)(float*, float, const float*, const ov::float16*, const uint8_t*, bool, size_t, float, float&); static constexpr func_fp32_type funcs_fp32[] = { scale_add2_reduce_max, scale_add2_reduce_max, @@ -918,12 +922,24 @@ inline void attn_softmax_kernel(float* a, scale_add2_reduce_max, scale_add2_reduce_max }; + static constexpr func_f16_type funcs_f16[] = { + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max + }; int dispatch = (alibi ? 0b100 : 0) | (attn_mask ? 0b010 : 0) | (causal_mask ? 
0b001 : 0); float max = std::numeric_limits::lowest(); if (attn_mask_prec == ov::element::f32) { funcs_fp32[dispatch](a, scale, alibi, static_cast(attn_mask), causal_mask, select_nfltmax_at_0, len, alibi_slope, max); - } else { + } else if (attn_mask_prec == ov::element::bf16) { funcs_bf16[dispatch](a, scale, alibi, static_cast(attn_mask), causal_mask, select_nfltmax_at_0, len, alibi_slope, max); + } else { + funcs_f16[dispatch](a, scale, alibi, static_cast(attn_mask), causal_mask, select_nfltmax_at_0, len, alibi_slope, max); } float sum = 0.0f; @@ -936,11 +952,16 @@ inline void attn_softmax_kernel(float* a, // apply causual mask to final result instead of attn_score if (total_size > len) memset(static_cast(a_dst) + len, 0, sizeof(float) * (total_size - len)); - } else { + } else if (dst_precision == ov::element::bf16) { multiply_scalar(a, static_cast(a_dst), scalar, len); // apply causual mask to final result instead of attn_score if (total_size > len) memset(static_cast(a_dst) + len, 0, sizeof(ov::bfloat16) * (total_size - len)); + } else { + multiply_scalar(a, static_cast(a_dst), scalar, len); + // apply causual mask to final result instead of attn_score + if (total_size > len) + memset(static_cast(a_dst) + len, 0, sizeof(ov::float16) * (total_size - len)); } } #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) @@ -1022,4 +1043,4 @@ inline void attn_softmax_kernel(ov::float16* a, } // namespace XARCH } // namespace Cpu } // namespace Extensions -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.cpp index e729fac66dd257..2895a272b982b5 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.cpp @@ -38,32 +38,54 @@ BrgemmKernel::BrgemmKernel(size_t M, // blocking M M_blk = matmulOptimalM; M_tail = M % M_blk; - brgVnniFactor = 4 / inType.size(); - if (inType != ov::element::bf16 && inType != ov::element::f32) - THROW_ERROR("brgemm kernel only supports bf16, f32"); + if (!one_of(inType, ov::element::bf16, ov::element::f16, ov::element::f32)) + THROW_ERROR("brgemm kernel only supports f16, bf16, f32"); + bool is_f32 = inType == ov::element::f32; + bool is_bf16 = inType == ov::element::bf16; if (is_bf16 && !mayiuse(avx512_core_bf16)) THROW_ERROR("brgemm bf16 kernel could only be used above avx512_bf16"); - bool isAMXSupported = is_bf16 && mayiuse(avx512_core_amx); + bool is_f16 = inType == ov::element::f16; + if (is_f16 && !mayiuse(avx512_core_fp16)) + THROW_ERROR("brgemm f16 kernel could only be used above avx512_f16"); + + srcType = weiType = inType; + // If isa is avx512_core_fp16, f16 is supported by upconverted to f32 + is_avx_f16_only = inType == ov::element::f16 && mayiuse(avx512_core_fp16) && !mayiuse(avx512_core_amx_fp16); + if (is_avx_f16_only) { + srcType = ov::element::f32; + weiType = ov::element::f32; + } + brgVnniFactor = 4 / weiType.size(); + + /* + AVX AMX + fp32 Y N + bf16 Y Y + fp16 Y Y + */ + bool isAMXSupported = (is_bf16 && mayiuse(avx512_core_amx)) || (is_f16 && mayiuse(avx512_core_amx_fp16)); + bool isBrgWithAMX = isAMXSupported && !is_avx_f16_only; + size_t vlen; if (mayiuse(avx512_core)) vlen = cpu_isa_traits::vlen; else vlen = cpu_isa_traits::vlen; // blocking N - N_blk = is_bf16 ? 32 : std::max(N, vlen / inType.size()); + N_blk = !is_f32 ? 
32 : std::max(N, vlen / inType.size()); N_tail = N % N_blk; // blocking K - K_blk = isAMXSupported ? 32 : K; + K_blk = isBrgWithAMX ? 32 : K; K_tail = K % K_blk; - if (isAMXSupported && K_tail) { + if (isBrgWithAMX && K_tail) { K_tail = rnd_up(K_tail, 2); } // copied K must be round up by vlen / inType.size(), otherwise copy B kernel may access wrong memory - packedBSize = rnd_up(K, vlen / inType.size()) * rnd_up(N, N_blk) * inType.size(); + packedBSize = rnd_up(K, vlen / weiType.size()) * rnd_up(N, N_blk) * weiType.size(); size_t brg0BaseIdx = std::numeric_limits::max(); for (size_t m = 0; m < 2; m++) { for (size_t k = 0; k < 2; k++) { @@ -78,18 +100,18 @@ BrgemmKernel::BrgemmKernel(size_t M, brgemmCtx.M = M_; brgemmCtx.N = N_; brgemmCtx.K = K_; - brgemmCtx.LDA = k ? K_blk : lda; - brgemmCtx.LDB = (is_bf16 || b_transposed) ? rnd_up(N, N_blk) : ldb; // bf16/b_transposed needs copy + brgemmCtx.LDA = k ? K_blk : (is_avx_f16_only ? K : lda); // f16 use f32 internally + brgemmCtx.LDB = (!is_f32 || b_transposed) ? rnd_up(N, N_blk) : ldb; // bf16/fp16/b_transposed needs copy brgemmCtx.LDC = ldc; - brgemmCtx.dt_in0 = static_cast(DnnlExtensionUtils::ElementTypeToDataType(inType)); - brgemmCtx.dt_in1 = static_cast(DnnlExtensionUtils::ElementTypeToDataType(inType)); + brgemmCtx.dt_in0 = static_cast(DnnlExtensionUtils::ElementTypeToDataType(srcType)); + brgemmCtx.dt_in1 = static_cast(DnnlExtensionUtils::ElementTypeToDataType(weiType)); brgemmCtx.beta = beta; // don't create brgemm kernels for empty tiles if (M_ != 0 && K_ != 0 && N_ != 0) { if (brg0BaseIdx == std::numeric_limits::max()) brg0BaseIdx = getBrgIdx(m, k, n); - init_brgemm(brgemmCtx, brgKernels[getBrgIdx(m, k, n)], isAMXSupported); + init_brgemm(brgemmCtx, brgKernels[getBrgIdx(m, k, n)], isBrgWithAMX); } } } @@ -97,12 +119,19 @@ BrgemmKernel::BrgemmKernel(size_t M, auto& brgemmCtx0 = brgCtxs[brg0BaseIdx]; - if (brgemmCtx0.is_with_amx && K_tail) { - init_brgemm_copy_a(brgCopyAKernel, K, K_blk, K_tail, K_blk, brgemmCtx0.dt_in0, false, lda * inType.size()); - packedASize = M_blk * rnd_up(K, K_blk) * inType.size(); + if ((brgemmCtx0.is_with_amx && K_tail) || is_avx_f16_only) { + init_brgemm_copy_a(brgCopyAKernel, + K, + K_blk, + K_tail, + is_avx_f16_only ? K : K_blk, + brgemmCtx0.dt_in0, + false, + lda * inType.size()); + packedASize = M_blk * rnd_up(K, brgemmCtx0.LDA) * srcType.size(); } - if (brgemmCtx0.is_with_amx || inType == ov::element::bf16 || b_transposed) { + if (brgemmCtx0.is_with_amx || !is_f32 || b_transposed) { size_t b_stride = 0; b_stride = ldb * inType.size(); // K should use the original K @@ -136,10 +165,20 @@ void BrgemmKernel::init_brgemm(brgemmCtx& ctx, const bool is_int8 = one_of(ctx.dt_in0, data_type::u8, data_type::s8) && one_of(ctx.dt_in1, data_type::u8, data_type::s8); cpu_isa_t isa; - if (mayiuse(avx512_core)) { - isa = use_amx ? isa_undef - : ctx.dt_in0 == dnnl_data_type_t::dnnl_bf16 ? avx512_core_bf16 - : (is_int8 ? 
avx512_core_vnni : avx512_core); + if (use_amx) { + isa = isa_undef; + } else if (mayiuse(avx512_core)) { + if (ctx.dt_in0 == dnnl_data_type_t::dnnl_bf16 && mayiuse(avx512_core_bf16)) { + isa = avx512_core_bf16; + } else if (ctx.dt_in0 == dnnl_data_type_t::dnnl_f16 && mayiuse(avx512_core_fp16)) { + isa = avx512_core_fp16; + } else { + if (is_int8) { + isa = avx512_core_vnni; + } else { + isa = avx512_core; + } + } } else { isa = cpu_isa_t::avx2; } @@ -161,7 +200,7 @@ void BrgemmKernel::init_brgemm(brgemmCtx& ctx, ctx.K, nullptr); if (status != dnnl_success) { - THROW_ERROR("cannot be executed due to invalid brgconv params"); + THROW_ERROR("cannot be executed due to invalid brgemm params"); } if (use_amx && b_accumulate) { @@ -193,6 +232,7 @@ void BrgemmKernel::init_brgemm(brgemmCtx& ctx, } brgKernel.reset(brgKernel_); } + void BrgemmKernel::init_brgemm_copy_a( std::unique_ptr& brgCopyKernel, size_t K, @@ -214,13 +254,15 @@ void BrgemmKernel::init_brgemm_copy_a( brgCopyKernelConf.s8s8_compensation_required = false; brgCopyKernelConf.wei_zp_type = dnnl::impl::cpu::x64::none; brgCopyKernelConf.src_zp_type = dnnl::impl::cpu::x64::none; - brgCopyKernelConf.src_dt = dt_in0; + brgCopyKernelConf.src_dt = is_avx_f16_only ? dnnl_data_type_t::dnnl_f32 : dt_in0; brgCopyKernelConf.copy_A_src_stride = copy_A_src_stride; - brgCopyKernelConf.a_dt_sz = DnnlExtensionUtils::sizeOfDataType(static_cast(dt_in0)); + // copy_a_kernel assumes that in/out tensor has same data type except f16 + // copy_a_kernel has special path for f16: assuming input(f16) -> output(f32) + brgCopyKernelConf.a_dt_sz = is_avx_f16_only ? sizeof(ov::float16) : DnnlExtensionUtils::sizeOfDataType(static_cast(dt_in0)); // copied A has the same precision of original - brgCopyKernelConf.tr_a_dt_sz = DnnlExtensionUtils::sizeOfDataType(static_cast(dt_in0)); + brgCopyKernelConf.tr_a_dt_sz = is_avx_f16_only ? sizeof(float) : DnnlExtensionUtils::sizeOfDataType(static_cast(dt_in0)); brgCopyKernelConf.transposed_A = transpose; - brgCopyKernelConf.isa = avx512_core_amx; + brgCopyKernelConf.isa = is_avx_f16_only ? avx512_core_fp16 : avx512_core_amx; create_brgemm_matmul_copy_a(brgCopyKernel, &brgCopyKernelConf); } @@ -238,8 +280,8 @@ void BrgemmKernel::init_brgemm_copy_b( bool transpose, size_t copy_B_wei_stride) { brgemm_matmul_conf_t brgCopyKernelConf; - brgCopyKernelConf.src_dt = dt_in0; - brgCopyKernelConf.wei_dt = dt_in1; + brgCopyKernelConf.src_dt = is_avx_f16_only ? dnnl_data_type_t::dnnl_f32 : dt_in0; + brgCopyKernelConf.wei_dt = is_avx_f16_only ? dnnl_data_type_t::dnnl_f32 : dt_in1; brgCopyKernelConf.orig_wei_dt = dt_in1; brgCopyKernelConf.wei_n_blk = N_blk; brgCopyKernelConf.wei_tag = transpose ? dnnl_ba : dnnl_ab; @@ -255,17 +297,23 @@ void BrgemmKernel::init_brgemm_copy_b( brgCopyKernelConf.K_blk = K; brgCopyKernelConf.K_tail = 0; brgCopyKernelConf.N_chunk_elems = brgCopyKernelConf.N_blk; - brgCopyKernelConf.b_dt_sz = + // f16 is computed by upconverting. in(f16) -> out(f32) + brgCopyKernelConf.b_dt_sz = is_avx_f16_only ? sizeof(ov::float16) : DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.src_dt)); - brgCopyKernelConf.tr_b_dt_sz = + brgCopyKernelConf.tr_b_dt_sz = is_avx_f16_only ? sizeof(float) : DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.src_dt)); brgCopyKernelConf.req_wei_vnni_downconvert = false; if (is_with_amx) { - brgCopyKernelConf.isa = avx512_core_amx; + brgCopyKernelConf.isa = dt_in0 == dnnl_data_type_t::dnnl_f16 ? 
avx512_core_amx_fp16 : avx512_core_amx; brgCopyKernelConf.s8s8_compensation_required = false; } else { - brgCopyKernelConf.isa = dt_in0 == dnnl_data_type_t::dnnl_bf16 ? avx512_core_bf16 : avx512_core_vnni; + if (inType == ov::element::f16) { + brgCopyKernelConf.isa = mayiuse(avx512_core_fp16) ? avx512_core_fp16 : avx2_vnni_2; + } else { + brgCopyKernelConf.isa = dt_in0 == dnnl_data_type_t::dnnl_bf16 ? avx512_core_bf16 : avx512_core_vnni; + } + brgCopyKernelConf.s8s8_compensation_required = false; } brgCopyKernelConf.has_zero_point_a = false; @@ -283,7 +331,7 @@ void BrgemmKernel::copy_buffer_b(void* b, void* scratch_b) { for (size_t nb = 0; nb < div_up(N, N_blk); nb++) { auto N_stride = b_transposed ? ldb : 1; auto pCopyKernel0In = ptr_b + nb * N_blk * inType.size() * N_stride; - auto pCopyKernel0Out = ptr_scartch_b + nb * N_blk * brgVnniFactor * inType.size(); + auto pCopyKernel0Out = ptr_scartch_b + nb * N_blk * brgVnniFactor * weiType.size(); auto ctx = jit_brgemm_matmul_copy_b_t::ctx_t(); @@ -306,15 +354,13 @@ void BrgemmKernel::executeGemm(bool is_M_tail, void* a, void* b, void* c, void* auto ptr_C = reinterpret_cast(c); auto ptr_scartch_a = reinterpret_cast(scratch_a); auto ptr_scartch_b = reinterpret_cast(b); - uint8_t* ptr_a_tail = nullptr; size_t brgIdx0 = getBrgIdx(0, 0, 0); // The step for matrix A over main K dimension size_t K0_step0 = brgCtxs[brgIdx0].K; auto cur_M_blk = is_M_tail ? M_tail : M_blk; if (brgCopyAKernel) { - // only copy tailed data; - size_t K_offset = K < K_blk ? 0 : K0_step0 * inType.size(); + size_t K_offset = is_avx_f16_only ? 0 : (K < K_blk ? 0 : K0_step0 * srcType.size()); auto pCopyKernelIn = ptr_A + K_offset; auto pCopyKernelOut = ptr_scartch_a; @@ -331,8 +377,6 @@ void BrgemmKernel::executeGemm(bool is_M_tail, void* a, void* b, void* c, void* ctx.current_K_blk = K % K_blk; (*brgCopyAKernel)(&ctx); - - ptr_a_tail = pCopyKernelOut; } size_t count_N = 0; for (size_t n = 0; n < 2; n++) { @@ -341,17 +385,17 @@ void BrgemmKernel::executeGemm(bool is_M_tail, void* a, void* b, void* c, void* size_t mIdx = is_M_tail ? 1 : 0; auto& brgemmCtx = brgCtxs[getBrgIdx(mIdx, k, n)]; if (brgemmCtx.K != 0 && brgemmCtx.N != 0 && brgemmCtx.M != 0) { - auto local_a_ptr = k > 0 ? ptr_a_tail : ptr_A; - auto B_stride = (k * count_K + n * count_N * brgVnniFactor) * inType.size(); + auto local_a_ptr = is_avx_f16_only ? ptr_scartch_a : (k > 0 ? ptr_scartch_a : ptr_A); + auto B_stride = (k * count_K + n * count_N * brgVnniFactor) * weiType.size(); auto weight_ptr = ptr_scartch_b + B_stride; auto C_stride = n * count_N * ov::element::f32.size(); auto out_ptr = ptr_C + C_stride; callBrgemm(brgemmCtx, - brgKernels[getBrgIdx(mIdx, k, n)], - local_a_ptr, - weight_ptr, - out_ptr, - wsp); + brgKernels[getBrgIdx(mIdx, k, n)], + local_a_ptr, + weight_ptr, + out_ptr, + wsp); // stride K, N if body kernel is executed. 
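Aside: on AVX-512 machines that lack AMX-FP16, the is_avx_f16_only path above keeps the external tensors in f16 but lets the copy-A/copy-B kernels widen them to f32, so the brgemm itself computes in f32; that is also why a_dt_sz/b_dt_sz stay sizeof(ov::float16) while tr_a_dt_sz/tr_b_dt_sz become sizeof(float). A hedged sketch of the widening step (the real work is done by oneDNN's copy kernels, not a scalar loop like this):

#include <cstddef>
#include <vector>
#include "openvino/core/type/float16.hpp"

// Upconvert an f16 panel to f32 before handing it to an f32 GEMM.
std::vector<float> widen_to_f32(const ov::float16* src, size_t count) {
    std::vector<float> dst(count);
    for (size_t i = 0; i < count; ++i)
        dst[i] = static_cast<float>(src[i]);  // f16 -> f32
    return dst;
}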
if (k == 0) { count_K = brgemmCtx.K * brgemmCtx.LDB; @@ -373,17 +417,17 @@ void BrgemmKernel::executeGemm(void* a, void* b, void* c, void* wsp, void* scrat for (size_t mb = 0; mb < div_up(M, M_blk); mb++) { const bool is_M_tail = (M - mb * M_blk < M_blk); - auto ptr_a = ptr_A + (mb * M_blk * lda) * inType.size(); + auto ptr_a = ptr_A + (mb * M_blk * lda) * srcType.size(); auto ptr_c = ptr_C + (mb * M_blk * ldc) * ov::element::f32.size(); executeGemm(is_M_tail, ptr_a, scratch_b, wsp, ptr_c, scratch_a); } } void BrgemmKernel::callBrgemm(brgemmCtx& ctx, - std::unique_ptr& brgKernel, - const void* pin0, - const void* pin1, - void* pout, - void* wsp) { + std::unique_ptr& brgKernel, + const void* pin0, + const void* pin1, + void* pout, + void* wsp) { if (ctx.is_with_amx) amx_tile_configure(ctx.palette); if (ctx.is_with_comp) { @@ -398,4 +442,4 @@ void BrgemmKernel::callBrgemm(brgemmCtx& ctx, } } // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.hpp index 513b484ab0b963..38384f2aceae83 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.hpp @@ -59,6 +59,9 @@ class BrgemmKernel { size_t packedBSize = 0; size_t packedASize = 0; ov::element::Type inType; + ov::element::Type weiType; + ov::element::Type srcType; + bool is_avx_f16_only = false; bool b_accumulate = false; static constexpr size_t MHA_BRGEMM_KERNELS_NUM = 8; static constexpr size_t matmulOptimalM = 32; diff --git a/src/plugins/intel_cpu/src/nodes/matmul.cpp b/src/plugins/intel_cpu/src/nodes/matmul.cpp index 50cb3353612996..92d8f356728ed9 100644 --- a/src/plugins/intel_cpu/src/nodes/matmul.cpp +++ b/src/plugins/intel_cpu/src/nodes/matmul.cpp @@ -543,6 +543,20 @@ void MatMul::prepareParams() { if (!src0MemPtr || !src0MemPtr->isDefined() || !src1MemPtr || !src1MemPtr->isDefined()) OPENVINO_THROW(errorPrefix, " has undefined input memory"); + // check for a degenerate case. In this context the degenerate case is a matrix multiplication where the + // collapsing dimension is zero, e.g., AB=C, where A has the shape [10, 0] and B has the shape [0, 20], + // consequently C has shape [10, 20]. In this scenario C is a null matrix (a matrix filled with zeroes) + // according to the empty sum convention. 
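Aside: the "empty sum convention" mentioned above is just the definition of matrix multiplication: each output element is a sum over the collapsing dimension K, and a sum with zero terms is zero, so a [10, 0] x [0, 20] product is a [10, 20] matrix of zeroes. A tiny reference implementation makes that concrete (illustrative only, not the plugin's executor):

#include <cstddef>
#include <vector>

// C[MxN] = A[MxK] * B[KxN]; when K == 0 the inner loop never runs and C stays zero.
std::vector<float> matmul_ref(const float* A, const float* B, size_t M, size_t K, size_t N) {
    std::vector<float> C(M * N, 0.0f);  // value-initialized, i.e. the empty sum
    for (size_t i = 0; i < M; ++i)
        for (size_t j = 0; j < N; ++j)
            for (size_t k = 0; k < K; ++k)
                C[i * N + j] += A[i * K + k] * B[k * N + j];
    return C;
}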
+ if (src0MemPtr->getDesc().getShape().hasZeroDims() && src0MemPtr->getDesc().getShape().hasZeroDims() && + !dstMemPtr->getDesc().getShape().hasZeroDims()) { + // todo: obviously we need a special executor that would process fused ops providing a correct result + OPENVINO_ASSERT(!withBiases && fusedWith.empty(), + "Matmul doesn't support a degenerate case when other ops are fused"); + //reset executor + execPtr.reset(); + return; + } + const NodeDesc* selected_pd = getSelectedPrimitiveDescriptor(); if (selected_pd == nullptr) OPENVINO_THROW(errorPrefix, " did not set preferable primitive descriptor"); @@ -646,6 +660,9 @@ void MatMul::prepareParams() { void MatMul::execute(dnnl::stream strm) { if (execPtr) { execPtr->exec(primArgs, strm); + } else if (hasEmptyInputTensors()) { + // this is a degenerate case, fill output with zeroes + getDstMemoryAtPort(0)->nullify(); } else { OPENVINO_THROW(errorPrefix, " doesn't have an initialized executor"); } @@ -691,6 +708,10 @@ const std::vector& MatMul::getDefaultImplPriority() { return priorities; } +bool MatMul::isExecutable() const { + return !hasEmptyOutputTensors(); +} + } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/matmul.h b/src/plugins/intel_cpu/src/nodes/matmul.h index 7b8f064e17260b..2e487148d0ec0c 100644 --- a/src/plugins/intel_cpu/src/nodes/matmul.h +++ b/src/plugins/intel_cpu/src/nodes/matmul.h @@ -43,6 +43,8 @@ class MatMul : public Node { const std::vector& getDefaultImplPriority() override; bool canBeExecutedInInt8() const override; + bool isExecutable() const override; + protected: AttrPtr initPrimitiveAttr() override; AttrPtr initPrimitiveAttr(const VectorDims& dims); diff --git a/src/plugins/intel_cpu/src/nodes/memory.cpp b/src/plugins/intel_cpu/src/nodes/memory.cpp index e66b148c6f99ee..88693ebfa49fdf 100644 --- a/src/plugins/intel_cpu/src/nodes/memory.cpp +++ b/src/plugins/intel_cpu/src/nodes/memory.cpp @@ -300,21 +300,27 @@ void MemoryOutput::runStatic(dnnl::stream strm) { void MemoryOutput::runDynamic(dnnl::stream strm) { //first we have to resize the output memory auto inputMem = getSrcMemoryAtPort(0); - const auto& newDims = inputMem->getStaticDims(); - OPENVINO_ASSERT(extMemDesc, - "MemoryOutput ", - getName(), - " uninitialized assigned memory"); - - auto newExternDesc = extMemDesc->cloneWithNewDims(newDims); OPENVINO_ASSERT(assignedMem, "MemoryOutput ", getName(), " uninitialized assigned memory"); - assignedMem->redefineDesc(newExternDesc); - runStatic(strm); + const auto& newShape = inputMem->getShape(); + const auto& stateShape = assignedMem->getShape(); + + if (stateShape.isDynamic() || stateShape.getStaticDims() != newShape.getStaticDims()) { + OPENVINO_ASSERT(extMemDesc, + "MemoryOutput ", + getName(), + " uninitialized assigned memory"); + auto newExternDesc = extMemDesc->cloneWithNewDims(newShape.getStaticDims()); + assignedMem->redefineDesc(newExternDesc); + } + + if (!newShape.hasZeroDims()) { // no need to copy data for empty tensor + runStatic(strm); + } } bool MemoryOutputStub::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { @@ -413,7 +419,7 @@ MemoryInputBase::~MemoryInputBase() { } MemoryOutputBase& MemoryInputBase::getOutputNode() { - OPENVINO_ASSERT(outputNode, "MemoryOutput ", getName(), " doesn't have sibling input"); + OPENVINO_ASSERT(outputNode, "MemoryInput ", getName(), " doesn't have sibling output"); return *outputNode; } @@ -593,31 +599,44 @@ void MemoryInput::runDynamic(dnnl::stream strm) { 
getName(), " assigned state has null memory ptr"); - // check whether we can share memory block - const auto& stateDims = assignedMem->getStaticDims(); - const bool hasZeroDims = std::count(std::begin(stateDims), std::end(stateDims), 0) > 0; - auto internDesc = getBaseMemDescAtOutputPort(0)->cloneWithNewDims(stateDims, hasZeroDims); - OPENVINO_ASSERT(memBlock, "MemoryInput ", getName(), " has uninitialized memory block."); + // check whether we can share memory block + const auto& shape = assignedMem->getShape(); + const bool hasZeroDims = shape.hasZeroDims(); + const bool processInitGraph = needInitGraphProcessing(); + const auto& stateDims = shape.getStaticDims(); + + if (hasZeroDims && !processInitGraph) { + // fast track as we don't really need to share memory and transfer any data for empty tensors + memBlock->reset(); + redefineOutputMemory(0, stateDims); + return; + } + + auto dst = getDstMemoryAtPort(0); + auto currentOutputDesc = dst->getDescPtr(); + + auto internDesc = currentOutputDesc->isDefined() && (currentOutputDesc->getShape().getStaticDims() == stateDims) + ? currentOutputDesc + : getBaseMemDescAtOutputPort(0)->cloneWithNewDims(stateDims, hasZeroDims); + if (internDesc->isCompatible(assignedMem->getDesc())) { memBlock->setMemBlock(assignedMem->getMemoryBlock()); } else { memBlock->reset(); } - const bool processInitGraph = needInitGraphProcessing(); //reshape output const auto& newDims = processInitGraph ? getSrcMemoryAtPort(0)->getStaticDims() : stateDims; - redefineOutputMemory({newDims}); + redefineOutputMemory(0, newDims); //copy data when necessary auto src = processInitGraph ? getSrcMemoryAtPort(0) : assignedMem; - auto dst = getDstMemoryAtPort(0); if (src->getData() != dst->getData()) { dst->load(*src); } @@ -847,6 +866,6 @@ void MemoryInputSDPA::resolveInPlaceEdges(Edge::LOOK look) { } } -} // namespace node +} // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/paged_attn.cpp b/src/plugins/intel_cpu/src/nodes/paged_attn.cpp index 2272fa481d5471..6bf7d3099a85d9 100644 --- a/src/plugins/intel_cpu/src/nodes/paged_attn.cpp +++ b/src/plugins/intel_cpu/src/nodes/paged_attn.cpp @@ -190,6 +190,8 @@ ov::element::Type PagedAttention::getRuntimePrecision() const { // bf16 should be enabled only when platform supports if (rtPrecision == ov::element::bf16 && ov::with_cpu_x86_bfloat16()) { rtPrecision = ov::element::bf16; + } else if (rtPrecision == ov::element::f16 && ov::with_cpu_x86_avx512_core_fp16()) { + rtPrecision = ov::element::f16; } else { rtPrecision = ov::element::f32; } diff --git a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp index 016fa90398aa4b..eecba2acff260b 100644 --- a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp +++ b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp @@ -261,7 +261,7 @@ struct MHAKernel { void prepare_brgemm_prim(dnnl::stream strm, PlainTensor& query, PlainTensor& present_key, bool has_out_transpose) { auto in_type = precision_of::value; - auto qkv_dt = in_type == ov::element::f32 ? 
dt::f32 : dt::bf16; + auto qkv_dt = DnnlExtensionUtils::ElementTypeToDataType(in_type); auto B = query.size(0); auto H = query.size(1); auto q_len = query.size(2); @@ -354,13 +354,13 @@ struct MHAKernel { size_t h_each_group_len = H / Hk; const size_t m_block_size = qk_gemm_ptr->get_mblk_size(); auto m_blocks = (q_len + m_block_size - 1) / m_block_size; - bool is_bf16 = precision_of::value == ov::element::bf16; + bool is_xf16 = precision_of::value == ov::element::bf16 || precision_of::value == ov::element::f16; // packed k, v parallel_for2d(B, Hk, [&](size_t b, size_t h) { T* k_ptr = &present_key.at({b, h, 0, 0}); T* v_ptr = &present_value.at({b, h, 0, 0}); qk_gemm_ptr->copy_buffer_b(k_ptr, &qk_scratch_b.at({b, h, 0})); - if (is_bf16) + if (is_xf16) wv_gemm_ptr->copy_buffer_b(v_ptr, &wv_scratch_b.at({b, h, 0})); }); @@ -420,12 +420,12 @@ struct MHAKernel { } auto* w_ptr = reinterpret_cast(weight_score.ptr(ithr, h, 0, 0)); float* fp32_out_ptr; - if (is_bf16) { + if (is_xf16) { fp32_out_ptr = has_out_transpose ? &fp32_out.at({b, m_start, h, 0}) : &fp32_out.at({b, h, m_start, 0}); } else { fp32_out_ptr = has_out_transpose ? &output_emb.at({b, m_start, h * head_size}) : &output_emb.at({b, h, m_start, 0}); } - T* v_ptr = is_bf16 ? &wv_scratch_b.at({b, h / h_each_group_len, 0}) + T* v_ptr = is_xf16 ? &wv_scratch_b.at({b, h / h_each_group_len, 0}) : &present_value.at({b, h / h_each_group_len, 0, 0}); wv_gemm_ptr->executeGemm(m_cnt < m_block_size, w_ptr, @@ -433,12 +433,12 @@ struct MHAKernel { fp32_out_ptr, wsp.data() + tid * wsp_size_per_thread, wv_scratch_a ? &wv_scratch_a.at({tid, 0}) : nullptr); - if (is_bf16) { + if (is_xf16) { if (has_out_transpose) { attn_memcpy2d_kernel(&fp32_out.at({b, m_start, h, 0}), &output_emb.at({b, m_start, h * head_size}), ov::element::f32, - ov::element::bf16, + precision_of::value, fp32_out.stride(1), output_emb.stride(1), head_size, @@ -447,7 +447,7 @@ struct MHAKernel { attn_memcpy2d_kernel(&fp32_out.at({b, h, m_start, 0}), &output_emb.at({b, h, m_start, 0}), ov::element::f32, - ov::element::bf16, + precision_of::value, 0, 0, m_cnt * head_size, @@ -1068,28 +1068,35 @@ void ScaledDotProductAttention::createPrimitive() { auto builder = [&](const ScaledDotProductAttentionKey& key) -> std::shared_ptr { std::shared_ptr executor = nullptr; - if (rtPrecision == ov::element::bf16) { #ifdef OPENVINO_ARCH_X86_64 + if (rtPrecision == ov::element::bf16) { executor = std::make_shared>(context); -#endif + } else if (rtPrecision == ov::element::f16) { + if (with_cpu_x86_avx512_core_fp16()) { + executor = std::make_shared>(context); + } else { + executor = std::make_shared>(context); + } } else { -#if defined(OV_CPU_WITH_ACL) - if (rtPrecision == ov::element::f16) - executor = std::make_shared>(context); - else - executor = std::make_shared>(context); -#elif defined(OV_CPU_WITH_MLAS) +#ifdef OV_CPU_WITH_MLAS executor = std::make_shared>(context); -#elif defined(OPENVINO_ARCH_X86_64) +#else if (with_cpu_x86_avx512_core()) { executor = std::make_shared>(context); } else { executor = std::make_shared>(context); } -#else - executor = std::make_shared>(context); #endif } +#elif defined(OV_CPU_WITH_ACL) + if (rtPrecision == ov::element::f16) { + executor = std::make_shared>(context); + } else { + executor = std::make_shared>(context); + } +#else + executor = std::make_shared>(context); +#endif return executor; }; diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index fa1810ff6044f9..5c88772eeedabc 100644 --- 
a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -446,7 +446,7 @@ ov::Any Plugin::get_ro_property(const std::string& name, const ov::AnyMap& optio } else if (ov::internal::supported_properties == name) { return decltype(ov::internal::supported_properties)::value_type{ ov::PropertyName{ov::internal::caching_properties.name(), ov::PropertyMutability::RO}, -#if !defined(OPENVINO_ARCH_ARM) +#if !defined(OPENVINO_ARCH_ARM) && !(defined(__APPLE__) || defined(__MACOSX)) ov::PropertyName{ov::internal::caching_with_mmap.name(), ov::PropertyMutability::RO}, #endif ov::PropertyName{ov::internal::exclusive_async_requests.name(), ov::PropertyMutability::RW}, diff --git a/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp b/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp index e5d87c578712f6..5db6f97bba8c02 100644 --- a/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp +++ b/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp @@ -101,6 +101,7 @@ #include "scaled_dot_product_attention_shape_inference.hpp" #include "scatter_elements_update_shape_inference.hpp" #include "scatter_nd_base_shape_inference.hpp" +#include "search_sorted_shape_inference.hpp" #include "select_shape_inference.hpp" #include "shape_nodes.hpp" #include "shuffle_channels_shape_inference.hpp" @@ -405,6 +406,7 @@ using IStaticShapeInferFactory = template <> const IStaticShapeInferFactory::TRegistry IStaticShapeInferFactory::registry{ // opset15 + _OV_OP_SHAPE_INFER_MASK_REG(op::v15::SearchSorted, ShapeInferTA, util::bit::mask()), _OV_OP_SHAPE_INFER_MASK_REG(op::v15::StringTensorUnpack, ShapeInferTA, util::bit::mask(0)), _OV_OP_SHAPE_INFER_MASK_REG(op::v15::StringTensorPack, ShapeInferTA, util::bit::mask(0, 1)), _OV_OP_SHAPE_INFER_MASK_REG(opset15::EmbeddingBagOffsets, ShapeInferTA, util::bit::mask()), diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index abf1ad8f283205..0e683482a97934 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -238,6 +238,17 @@ bool Transformations::fuse_type_to_fq(const std::shared_ptr& node, con return true; } +bool Transformations::fuse_type_to_pa(const std::shared_ptr& node, const precisions_map& precisions) { + auto pa = ov::as_type_ptr(node); + if (!pa) + return false; + // PagedAttentionExtension's 2nd output type should be kept f32. + // The reason is that the pagedattention node in CPU plugin hardcodes 2nd output type as f32. + // So, set f32 to the 2nd output type, which can avoid extra data type conversion during transformation. 
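Aside: for readability, here is the general shape of such a type-fusing callback with the template arguments restored (they are stripped in this rendering of the diff); the exact header paths are an assumption:

#include <memory>
#include "openvino/core/node.hpp"                 // assumed header locations
#include "openvino/op/paged_attention.hpp"
#include "transformations/convert_precision.hpp"  // precisions_map, type_to_fuse_map

bool fuse_second_output_to_f32(const std::shared_ptr<ov::Node>& node,
                               const precisions_map& /*precisions*/) {
    auto pa = ov::as_type_ptr<ov::op::PagedAttentionExtension>(node);
    if (!pa)
        return false;
    pa->set_out_type(1, ov::element::f32);  // keep the score output in f32
    return true;
}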
+ pa->set_out_type(1, ov::element::f32); + return true; +} + bool Transformations::fuse_type_to_convert(const std::shared_ptr& node, const precisions_map& precisions) { auto convert = ov::as_type_ptr(node); if (!convert) @@ -391,7 +402,7 @@ void Transformations::PreLpt(const std::vector& defaultPrecis #if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) type_to_fuse_map fuse_map = {{ov::opset1::FakeQuantize::get_type_info_static(), fuse_type_to_fq}}; #else - type_to_fuse_map fuse_map = {}; + type_to_fuse_map fuse_map = {{ov::op::PagedAttentionExtension::get_type_info_static(), fuse_type_to_pa}}; #endif const bool keep_precision_sensitive_in_fp32 = true; CPU_REGISTER_PASS_COMMON(manager, diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.h b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.h index 0b6a437f667747..33c26ab8aea9e4 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.h +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.h @@ -48,6 +48,7 @@ class Transformations { static bool fuse_type_to_convert(const std::shared_ptr& node, const precisions_map& precisions); static bool fuse_type_to_fq(const std::shared_ptr& node, const precisions_map& precisions); + static bool fuse_type_to_pa(const std::shared_ptr& node, const precisions_map& precisions); }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/utils/serialize.cpp b/src/plugins/intel_cpu/src/utils/serialize.cpp index 6666d42fb4f586..f7fd337afa932e 100644 --- a/src/plugins/intel_cpu/src/utils/serialize.cpp +++ b/src/plugins/intel_cpu/src/utils/serialize.cpp @@ -58,19 +58,20 @@ void ModelDeserializer::set_info(pugi::xml_node& root, std::shared_ptr>(std::shared_ptr& model) { - if (auto mmap_stream = dynamic_cast(&m_istream)) { - process_mmap(model, mmap_stream->m_memory); + if (auto mmap_buffer = dynamic_cast(m_istream.rdbuf())) { + auto buffer = mmap_buffer->get_buffer(); + process_mmap(model, buffer); } else { process_stream(model); } } void ModelDeserializer::process_mmap(std::shared_ptr& model, - const std::shared_ptr& mmemory) { + const std::shared_ptr& mmemory) { // Note: Don't use seekg with mmaped stream. This may affect the performance of some models. // Get file size before seek content. // Blob from cache may have other header, so need to skip this. 
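Aside: with a memory-mapped blob, the header is parsed by pointer arithmetic at a known offset instead of stream seeks. A sketch of that idea; the struct layout and field names below are illustrative, not the plugin's real header format:

#include <cstddef>
#include <cstdint>
#include <cstring>

struct BlobHeader {            // hypothetical layout
    uint64_t consts_offset;
    uint64_t consts_size;
    uint64_t model_offset;
    uint64_t model_size;
};

// hdr_pos is where the cache blob's own header ends (e.g. what istream.tellg() returned).
inline BlobHeader read_header(const uint8_t* buffer_base, size_t hdr_pos) {
    BlobHeader hdr{};
    std::memcpy(&hdr, buffer_base + hdr_pos, sizeof(hdr));  // no seekg on the mmapped data
    return hdr;
}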
- auto buffer_base = mmemory->data(); + auto buffer_base = reinterpret_cast(mmemory->get_ptr()); const auto file_size = mmemory->size(); const size_t hdr_pos = m_istream.tellg(); @@ -98,9 +99,7 @@ void ModelDeserializer::process_mmap(std::shared_ptr& model, // Map blob content std::shared_ptr weights_buf; if (hdr.consts_size) { - weights_buf = std::make_shared>>(buffer_base + hdr.consts_offset, - hdr.consts_size, - mmemory); + weights_buf = std::make_shared>>(buffer_base + hdr.consts_offset, hdr.consts_size, mmemory); } // XML content diff --git a/src/plugins/intel_cpu/src/utils/serialize.hpp b/src/plugins/intel_cpu/src/utils/serialize.hpp index 817041452c9597..897a2c2e52f092 100644 --- a/src/plugins/intel_cpu/src/utils/serialize.hpp +++ b/src/plugins/intel_cpu/src/utils/serialize.hpp @@ -40,7 +40,7 @@ class ModelDeserializer { protected: static void set_info(pugi::xml_node& root, std::shared_ptr& model); - void process_mmap(std::shared_ptr& model, const std::shared_ptr& memory); + void process_mmap(std::shared_ptr& model, const std::shared_ptr& memory); void process_stream(std::shared_ptr& model); diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/activation.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/activation.cpp index 0f25351a020f60..307938fbfec17a 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/activation.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/activation.cpp @@ -193,6 +193,7 @@ std::string ActivationLayerCPUTest::getPrimitiveType(const utils::ActivationType (activation_type == utils::ActivationTypes::Relu) || (activation_type == utils::ActivationTypes::Sigmoid) || (activation_type == utils::ActivationTypes::SoftSign) || + (activation_type == utils::ActivationTypes::Sqrt) || (activation_type == utils::ActivationTypes::Swish) || (activation_type == utils::ActivationTypes::LogicalNot) || (activation_type == utils::ActivationTypes::Tanh))) { diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/matmul.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/matmul.cpp index 934a0f4bc95f18..9b5d7287875d7c 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/matmul.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/matmul.cpp @@ -35,8 +35,8 @@ const std::vector IS = { const std::vector IS_Dynamic = { { { //dynamic case description each pair per each input has {{dynamic shape}, {{static shape case1}, {static shape case2}, ...} - {{-1, -1}, {{55, 12}, {33, 7}}}, // input 0 - {{-1, -1}, {{12, 55}, {7, 33}}} // input 1 + {{-1, -1}, {{55, 12}, {33, 7}, {33, 0}, {0, 33}}}, // input 0 + {{-1, -1}, {{12, 55}, {7, 33}, {0, 33}, {33, 0}}} // input 1 }, {false, false} }, diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/concat_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/concat_sdp.cpp index eb6fdc2a6bfc3f..8a9212f8998f94 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/concat_sdp.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/concat_sdp.cpp @@ -37,6 +37,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConcatSDPTest, ConcatSDPTest, ::testing::Combine(::testing::Values(ElementType::f16), ::testing::ValuesIn(inputShapes), + ::testing::Values(false), ::testing::Values(true, false)), 
ConcatSDPTest::getTestCaseName); } // namespace diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.cpp index f4abaa03b7c28b..f5a7bfacfac99f 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.cpp @@ -28,8 +28,9 @@ namespace test { std::string ConcatSDPTest::getTestCaseName(const testing::TestParamInfo& obj) { ElementType inType; std::vector inputShapes; - bool hasShapeof; - std::tie(inType, inputShapes, hasShapeof) = obj.param; + bool forceKVU8; + bool hasShapeOf; + std::tie(inType, inputShapes, forceKVU8, hasShapeOf) = obj.param; std::ostringstream result; result << "IS="; for (const auto& shape : inputShapes) { @@ -46,21 +47,24 @@ std::string ConcatSDPTest::getTestCaseName(const testing::TestParamInfo(gatherK); shapeof_v = std::make_shared(gatherV); } @@ -107,20 +111,20 @@ void ConcatSDPTest::SetUp() { pastv_assign->set_friendly_name("pastv_w"); ResultVector results{std::make_shared(add)}; - if (hasShapeOf) { + if (m_hasShapeOf) { results.push_back(std::make_shared(shapeof_k)); results.push_back(std::make_shared(shapeof_v)); } SinkVector sinks{pastk_assign, pastv_assign}; function = std::make_shared(results, sinks, inputParams, "ConcatSDP"); targetDevice = ov::test::utils::DEVICE_CPU; - functionRefs = function->clone(); pass::Manager manager; // decompose ScaledDotProductAttention manager.register_pass(); manager.run_passes(functionRefs); } + void ConcatSDPTest::generate_inputs(const std::vector& targetInputStaticShapes) { std::vector shapes(4); shapes[0] = targetInputStaticShapes[0]; @@ -129,6 +133,7 @@ void ConcatSDPTest::generate_inputs(const std::vector& targetInputSta shapes[3] = targetInputStaticShapes[1]; SubgraphBaseTest::generate_inputs(shapes); } + template void strided_iota(IT first, size_t n, T value, T stride) { for (size_t i = 0; i < n; i++) { @@ -136,6 +141,7 @@ void strided_iota(IT first, size_t n, T value, T stride) { value += stride; } } + void ConcatSDPTest::generate(int idx, const std::vector& targetInputStaticShapes) { inputs.clear(); auto create_input = [this] (std::shared_ptr param, ov::Shape shape, float val) { @@ -169,16 +175,19 @@ void ConcatSDPTest::generate(int idx, const std::vector& targetInputS create_input(function->get_parameters()[3], targetInputStaticShapes[1], idx + 4.0f); create_input(function->get_parameters()[4], ov::Shape{targetInputStaticShapes[0][0]}, idx + 0.0f); } + void ConcatSDPTest::prepare() { compile_model(); inferRequest = compiledModel.create_infer_request(); ASSERT_TRUE(inferRequest); } + void ConcatSDPTest::reset() { for (auto&& state : inferRequest.query_state()) { state.reset(); } } + std::vector ConcatSDPTest::run_test(std::shared_ptr model) { function = model; prepare(); @@ -201,6 +210,12 @@ std::vector ConcatSDPTest::run_test(std::shared_ptr model } TEST_P(ConcatSDPTest, CompareWithRefs) { SKIP_IF_CURRENT_TEST_IS_DISABLED(); + ElementType inType; + std::vector inputShapes; + bool forceKVU8; + bool hasShapeOf; + std::tie(inType, inputShapes, forceKVU8, hasShapeOf) = this->GetParam(); + auto actualOutputs = run_test(function); if (!hasShapeOf) { CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1); @@ -216,9 +231,14 @@ TEST_P(ConcatSDPTest, CompareWithRefs) { } } } + + // the range of our result will exceed f16 max value and there may be 
'inf'. In softmax, there is a step: + // v - max(v), if v is inf, the result of 'v-max(v)' will be nan + // use f32 as reference if (inType == ElementType::f16) { configuration["INFERENCE_PRECISION_HINT"] = "f32"; } + auto expectedOutputs = run_test(functionRefs); CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 0); for (size_t i = 0; i < actualOutputs.size(); i++) { diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.hpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.hpp index 56fad11f53e600..ac59e48f496b3b 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.hpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/concat_sdp.hpp @@ -34,7 +34,7 @@ namespace test { template void strided_iota(IT first, size_t n, T value, T stride); -typedef std::tuple, bool> ConcatSDPTestParams; +typedef std::tuple, bool, bool> ConcatSDPTestParams; class ConcatSDPTest : public testing::WithParamInterface, @@ -46,7 +46,8 @@ class ConcatSDPTest : void prepare(); void reset(); std::vector run_test(std::shared_ptr model); - bool hasShapeOf; + bool m_forceKVU8; + bool m_hasShapeOf; protected: void generate_inputs(const std::vector& targetInputStaticShapes) override; void SetUp() override; diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp index bc73de76999daf..d05e7840562191 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp @@ -18,6 +18,7 @@ namespace test { using InputShapeAndTransposeOrder = std::pair, std::vector>; using ConcatMultiQuerySDPParams = std::tuple; // Subgraph: @@ -52,8 +53,10 @@ class ConcatMultiQuerySDPTest : public testing::WithParamInterface& obj) { ElementType qkvType; InputShapeAndTransposeOrder inputShapeAndOrders; - bool hasShapeof; - std::tie(qkvType, inputShapeAndOrders, hasShapeof) = obj.param; + bool forceKVU8; + bool hasShapeOf; + std::tie(qkvType, inputShapeAndOrders, forceKVU8, hasShapeOf) = obj.param; + ElementType kvCacheType = forceKVU8 ? 
ov::element::Type_t::u8 : qkvType; std::ostringstream result; std::vector& inputShapes = inputShapeAndOrders.first; std::vector& transposeOrder = inputShapeAndOrders.second; @@ -71,8 +74,9 @@ class ConcatMultiQuerySDPTest : public testing::WithParamInterfaceGetParam(); + std::tie(qkvType, inputShapeAndOrders, forceKVU8, hasShapeOf) = this->GetParam(); std::vector& inputShapes = inputShapeAndOrders.first; std::vector& transposeOrder = inputShapeAndOrders.second; targetDevice = ov::test::utils::DEVICE_CPU; rel_threshold = 1e-2f; configuration[ov::hint::inference_precision.name()] = ov::element::f32; - if (qkvType == ElementType::bf16) { - configuration[ov::hint::inference_precision.name()] = ov::element::bf16; + if (qkvType == ElementType::bf16 || qkvType == ElementType::f16) { + configuration[ov::hint::inference_precision.name()] = ov::element::Type(qkvType).get_type_name(); rel_threshold = 0.01f; } + if (forceKVU8) + configuration["KV_CACHE_PRECISION"] = "u8"; init_input_shapes(inputShapes); ov::ParameterVector inputParams; // q,k,v @@ -229,6 +236,10 @@ class ConcatMultiQuerySDPTest : public testing::WithParamInterface(t.data()), t.get_size(), val, 0.1f); inputs.insert({param, t}); + } else if (param->get_element_type() == element::f16) { + ov::Tensor t{ov::element::f16, shape}; + strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); + inputs.insert({param, t}); } else { ov::Tensor t{ov::element::bf16, shape}; strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); @@ -269,6 +280,10 @@ class ConcatMultiQuerySDPTest : public testing::WithParamInterface b.get_name(); + }); for (std::string name : {"pastk", "pastv"}) { auto itr = std::find_if(states.begin(), states.end(), [&](const ov::VariableState& state) { return name == state.get_name(); @@ -290,17 +305,20 @@ class ConcatMultiQuerySDPTest : public testing::WithParamInterfaceGetParam(); - if (qkvType == ElementType::bf16 && !ov::with_cpu_x86_bfloat16()) - GTEST_SKIP(); + std::tie(qkvType, inputShapeAndOrders, forceKVU8, hasShapeOf) = this->GetParam(); auto actualOutputs = run_test(function); CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1); CheckNumberOfNodesWithType(compiledModel, "Concatenation", 0); CheckNumberOfNodesWithType(compiledModel, "Reorder", 0); CheckNumberOfNodesWithType(compiledModel, "Transpose", 1); CheckNumberOfNodesWithType(compiledModel, "Gather", 0); + // use f32 as reference + if (qkvType == ElementType::f16) { + configuration["INFERENCE_PRECISION_HINT"] = "f32"; + } auto expectedOutputs = run_test(functionRefs); CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 0); for (size_t i = 0; i < actualOutputs.size(); i++) { @@ -384,8 +402,9 @@ const std::vector inputShapeAndReorders = {{ INSTANTIATE_TEST_SUITE_P(smoke_ConcatMultiQuerySDPTest, ConcatMultiQuerySDPTest, - ::testing::Combine(::testing::Values(ElementType::f32, ElementType::bf16), + ::testing::Combine(::testing::Values(ElementType::f32, ElementType::bf16, ElementType::f16), ::testing::ValuesIn(inputShapeAndReorders), + ::testing::Values(true, false), ::testing::Values(true, false)), ConcatMultiQuerySDPTest::getTestCaseName); } // namespace diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_sdp.cpp index f9971a7fe9ce16..57927434524891 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_sdp.cpp +++ 
b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_sdp.cpp @@ -37,8 +37,10 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConcatSDPTest, ConcatSDPTest, ::testing::Combine(::testing::Values(ElementType::f32), ::testing::ValuesIn(inputShapes), + ::testing::Values(true, false), ::testing::Values(true, false)), ConcatSDPTest::getTestCaseName); + } // namespace } // namespace test diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp index 839370d3a97728..65bc379c78b540 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp @@ -253,6 +253,10 @@ class ConcatSDPTransposeTest : public ConcatSDPTransposeTestBase { outputs.push_back(copy); } auto states = inferRequest.query_state(); + // k, v may be in any order + std::sort(states.begin(), states.end(), [] (VariableState& a, VariableState& b) { + return a.get_name() > b.get_name(); + }); for (std::string name : {"pastk", "pastv"}) { auto itr = std::find_if(states.begin(), states.end(), [&](const ov::VariableState& state) { return name == state.get_name(); diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/denormal_check.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/denormal_check.cpp index b98d4c61a1fb43..39fe70ebd87df4 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/denormal_check.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/denormal_check.cpp @@ -36,8 +36,8 @@ void SetUp() override { targetStaticShapes.push_back({inpShape}); targetDevice = ov::test::utils::DEVICE_CPU; - const auto elemsCount = shape_size(inpShape); const auto rtPrc = ov::element::f32; + const auto elemsCount = shape_size(inpShape) * rtPrc.size(); ov::ParameterVector params {std::make_shared(rtPrc, ov::Shape(inpShape))}; pConstStorage.reset(new ov::AlignedBuffer(elemsCount, alignment)); diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/concat_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/concat_sdp.cpp new file mode 100644 index 00000000000000..93c99048fec349 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/concat_sdp.cpp @@ -0,0 +1,47 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "openvino/opsets/opset13.hpp" +#include "openvino/pass/manager.hpp" +#include "transformations/op_conversions/scaled_dot_product_attention_decomposition.hpp" + +#include "custom/subgraph_tests/src/classes/concat_sdp.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "utils/cpu_test_utils.hpp" +#include "common_test_utils/ov_tensor_utils.hpp" + +using namespace CPUTestUtils; + +namespace ov { +namespace test { + +namespace { +const std::vector> inputShapes = { + // greedy search + { + // B, H, L1, S + {{1, 8, -1, 64}, {{1, 8, 10, 64}, {1, 8, 1, 64}, {1, 8, 1, 64}, {1, 8, 20, 64}, {1, 8, 1, 64}}}, + // B, H, L0, S + {{1, 8, -1, 64}, {{1, 8, 0, 64}, {1, 8, 10, 64}, {1, 8, 11, 64}, {1, 8, 12, 64}, {1, 8, 32, 64}}}, + }, + // beam search + { + // B, H, L1, S + {{-1, 8, -1, 64}, {{4, 8, 10, 64}, {4, 8, 1, 64}, {4, 8, 1, 64}, {4, 8, 1, 64}, 
{4, 8, 1, 64}}}, + // B, H, L0, S + {{-1, 8, -1, 64}, {{4, 8, 0, 64}, {4, 8, 10, 64}, {4, 8, 11, 64}, {4, 8, 12, 64}, {4, 8, 13, 64}}}, + }, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_ConcatSDPTest, + ConcatSDPTest, + ::testing::Combine(::testing::Values(ElementType::bf16, ElementType::f16), + ::testing::ValuesIn(inputShapes), + ::testing::Values(true, false), + ::testing::Values(true, false)), + ConcatSDPTest::getTestCaseName); + +} // namespace + +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 2a8f49b5dcfe0e..e7c006ab97427f 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -358,6 +358,8 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(smoke_VariableState/OVInferRequestVariableStateTest.*)"); // Issue: 141705 retVector.emplace_back(R"(.*smoke_arm_Deconv_2D_Planar_FP16/DeconvolutionLayerCPUTest.*INFERENCE_PRECISION_HINT=f16.*)"); + // Issue: 154882 + retVector.emplace_back(R"(.*ConcatMultiQuerySDPTest.*f16.*)"); #endif #if defined(OPENVINO_ARCH_ARM) @@ -529,6 +531,8 @@ std::vector disabledTestPatterns() { if (!ov::with_cpu_x86_avx512_core_fp16()) { // Skip fp16 tests for paltforms that don't support fp16 precision retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)"); + retVector.emplace_back(R"(.*ConcatMultiQuerySDPTest.*f16.*)"); + retVector.emplace_back(R"(.*ConcatSDPTest.*f16.*)"); } #elif defined(OPENVINO_ARCH_ARM64) || defined(OPENVINO_ARCH_ARM) if (!ov::intel_cpu::hasHardwareSupport(ov::element::f16)) { @@ -560,6 +564,7 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(.*smoke_Snippets_EnforcePrecision_bf16.*)"); retVector.emplace_back(R"(.*smoke_Snippets_MHAWOTransposeEnforceBF16.*)"); retVector.emplace_back(R"(.*smoke_Snippets_MHAEnforceBF16.*)"); + retVector.emplace_back(R"(.*ConcatSDPTest.*bf16.*)"); } // [150842] Need to support dynamic K dimension of BF16|INT8 MatMul on AMX systems if (ov::with_cpu_x86_avx512_core_amx()) { diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/lora_pattern.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/lora_pattern.cpp new file mode 100644 index 00000000000000..d85ced5f07a92e --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/lora_pattern.cpp @@ -0,0 +1,21 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "subgraph_tests/lora_pattern.hpp" + +using namespace ov::test; + +namespace { + +INSTANTIATE_TEST_SUITE_P(smoke, + LoraPatternConvolution, + ::testing::Values(ov::test::utils::DEVICE_CPU), + LoraPatternBase::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke, + LoraPatternMatmul, + ::testing::Values(ov::test::utils::DEVICE_CPU), + LoraPatternBase::getTestCaseName); + +} // namespace diff --git a/src/plugins/intel_cpu/tests/unit/brgemm_executor_test.cpp b/src/plugins/intel_cpu/tests/unit/brgemm_executor_test.cpp index 35a29f97452d4b..9ae58561d4dfcd 100644 --- a/src/plugins/intel_cpu/tests/unit/brgemm_executor_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/brgemm_executor_test.cpp @@ -28,18 +28,19 @@ void run_test(ov::element::Type rtPrec) { size_t K = 33; ov::intel_cpu::BrgemmKernel gemm(M, N, K, K, N, N, false, 
rtPrec); size_t nthr = 8; - bool is_bf16 = (rtPrec == ov::element::bf16); + bool is_f32 = (rtPrec == ov::element::f32); std::vector a_data(M * K, (1.0f/33)); std::vector b_data(K * N, 4.0f); std::vector c_data(nthr * M * N, 0.0f); std::vector wsp(nthr * 4 * 1024, 0.0f); - std::vector b_scracth(gemm.get_scratch_b_size(), 0.0f); - std::vector a_scracth(gemm.get_scratch_a_size(), 0.0f); - if (is_bf16) - gemm.copy_buffer_b(b_data.data(), b_scracth.data()); + std::vector a_scratch(gemm.get_scratch_a_size(), 0.0f); + std::vector b_scratch(gemm.get_scratch_b_size(), 0.0f); + if (!is_f32) { + gemm.copy_buffer_b(b_data.data(), b_scratch.data()); + } auto m_block_size = gemm.get_mblk_size(); auto m_blocks = (M + gemm.get_mblk_size() - 1) / m_block_size; - T* b_ptr = is_bf16 ? b_scracth.data() : b_data.data(); + void* b_ptr = !is_f32 ? static_cast(b_scratch.data()) : static_cast(b_data.data()); ov::parallel_for2d(nthr, m_blocks, [&](size_t i, size_t m_blk) { auto m_start = m_blk * m_block_size; auto m_end = std::min(m_start + m_block_size, M); @@ -49,7 +50,7 @@ void run_test(ov::element::Type rtPrec) { b_ptr, c_data.data() + i * M * N + m_start * N, wsp.data() + i * 4 * 1024, - a_scracth.data()); + a_scratch.data()); }); ov::parallel_for(nthr, [&](size_t i){ for (size_t m = 0; m < M; m++) { @@ -73,9 +74,13 @@ TEST_P(BrgemmKernelTest, simpleGemmTest) { GTEST_SKIP(); if (rtPrec == ov::element::f32 && !ov::with_cpu_x86_avx512_core()) GTEST_SKIP(); + if (rtPrec == ov::element::f16 && !ov::with_cpu_x86_avx512_core_fp16()) + GTEST_SKIP(); if (rtPrec == ov::element::bf16) { run_test(rtPrec); + } else if (rtPrec == ov::element::f16) { + run_test(rtPrec); } else { run_test(rtPrec); } @@ -83,6 +88,6 @@ TEST_P(BrgemmKernelTest, simpleGemmTest) { INSTANTIATE_TEST_SUITE_P(BrgemmKernelUnitTest, BrgemmKernelTest, - ::testing::Values(ov::element::f32, ov::element::bf16), + ::testing::Values(ov::element::f32, ov::element::bf16, ov::element::f16), BrgemmKernelTest::getTestCaseName); } // namespace brgemmUnitTest diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/search_sorted_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/search_sorted_shape_inference_test.cpp new file mode 100644 index 00000000000000..ac0b4763b7bf5d --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/search_sorted_shape_inference_test.cpp @@ -0,0 +1,114 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "common_test_utils/test_assertions.hpp" +#include "utils.hpp" + +using namespace ov; +using namespace ov::intel_cpu; +using ov::op::v0::Constant; +using ov::op::v0::Parameter; +using testing::HasSubstr; + +class SearchSortedShapeInferenceTest : public OpStaticShapeInferenceTest {}; + +TEST_F(SearchSortedShapeInferenceTest, same_dimensions_nd_inputs) { + const auto sorted = std::make_shared(element::i64, PartialShape::dynamic()); + const auto values = std::make_shared(element::i64, PartialShape::dynamic()); + const auto op = make_op(sorted, values); + const auto input_shapes = ShapeVector{StaticShape{1, 3, 6}, StaticShape{1, 3, 6}}; + const auto output_shapes = shape_inference(op.get(), input_shapes); + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes.front(), StaticShape({1, 3, 6})); +} + +TEST_F(SearchSortedShapeInferenceTest, scalar_values) { + const auto sorted = std::make_shared(element::i64, PartialShape::dynamic()); + const auto values = std::make_shared(element::i64, PartialShape::dynamic()); + 
const auto op = make_op(sorted, values); + const auto input_shapes = ShapeVector{StaticShape{3}, StaticShape{}}; + const auto output_shapes = shape_inference(op.get(), input_shapes); + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes.front(), StaticShape{}); +} + +TEST_F(SearchSortedShapeInferenceTest, different_last_dim) { + const auto sorted = std::make_shared(element::i64, PartialShape::dynamic()); + const auto values = std::make_shared(element::i64, PartialShape::dynamic()); + const auto op = make_op(sorted, values); + const auto input_shapes = ShapeVector{StaticShape{1, 3, 7, 100}, StaticShape{1, 3, 7, 10}}; + const auto output_shapes = shape_inference(op.get(), input_shapes); + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes.front(), StaticShape({1, 3, 7, 10})); +} + +TEST_F(SearchSortedShapeInferenceTest, 1d_inputs) { + const auto sorted = std::make_shared(element::i64, PartialShape::dynamic()); + const auto values = std::make_shared(element::i64, PartialShape::dynamic()); + const auto op = make_op(sorted, values); + const auto input_shapes = ShapeVector{StaticShape{5}, StaticShape{20}}; + const auto output_shapes = shape_inference(op.get(), input_shapes); + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes.front(), StaticShape({20})); +} + +TEST_F(SearchSortedShapeInferenceTest, 1d_sequence) { + const auto sorted = std::make_shared(element::i64, PartialShape::dynamic()); + const auto values = std::make_shared(element::i64, PartialShape::dynamic()); + const auto op = make_op(sorted, values); + const auto input_shapes = ShapeVector{StaticShape{50}, StaticShape{1, 3, 7, 10}}; + const auto output_shapes = shape_inference(op.get(), input_shapes); + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes.front(), StaticShape({1, 3, 7, 10})); +} + +TEST_F(SearchSortedShapeInferenceTest, element_type_consistency_validation) { + const auto sorted = std::make_shared(element::i64, PartialShape::dynamic()); + const auto values = std::make_shared(element::i32, PartialShape::dynamic()); + OV_EXPECT_THROW(std::ignore = make_op(sorted, values), + NodeValidationFailure, + testing::HasSubstr("must have the same element type")); +} + +TEST_F(SearchSortedShapeInferenceTest, input_shapes_ranks_validation) { + const auto sorted = std::make_shared(element::i32, PartialShape::dynamic()); + const auto values = std::make_shared(element::i32, PartialShape::dynamic()); + const auto op = make_op(sorted, values); + const auto input_shapes = ShapeVector{StaticShape{1, 3, 6}, StaticShape{1, 3, 6, 7}}; + OV_EXPECT_THROW(std::ignore = shape_inference(op.get(), input_shapes), + NodeValidationFailure, + testing::HasSubstr("the ranks of the inputs have to be compatible")); +} + +TEST_F(SearchSortedShapeInferenceTest, input_shapes_compatibility) { + const auto sorted = std::make_shared(element::i32, PartialShape::dynamic()); + const auto values = std::make_shared(element::i32, PartialShape::dynamic()); + const auto op = make_op(sorted, values); + const auto input_shapes = ShapeVector{StaticShape{1, 3, 6}, StaticShape{1, 6, 6}}; + OV_EXPECT_THROW(std::ignore = shape_inference(op.get(), input_shapes), + NodeValidationFailure, + testing::HasSubstr("All dimensions but the last one have to be compatible")); +} + +TEST_F(SearchSortedShapeInferenceTest, scalar_sorted_sequence) { + const auto sorted = std::make_shared(element::i32, PartialShape::dynamic()); + const auto values = std::make_shared(element::i32, PartialShape::dynamic()); + const auto op = make_op(sorted, values); + 
const auto input_shapes = ShapeVector{StaticShape{}, StaticShape{1, 6, 6}}; + OV_EXPECT_THROW(std::ignore = shape_inference(op.get(), input_shapes), + NodeValidationFailure, + testing::HasSubstr("The sorted sequence input cannot be a scalar")); +} + +TEST_F(SearchSortedShapeInferenceTest, scalar_values_and_ND_sequence) { + const auto sorted = std::make_shared(element::i32, PartialShape::dynamic()); + const auto values = std::make_shared(element::i32, PartialShape::dynamic()); + const auto op = make_op(sorted, values); + const auto input_shapes = ShapeVector{StaticShape{2, 3}, StaticShape{}}; + OV_EXPECT_THROW(std::ignore = shape_inference(op.get(), input_shapes), + NodeValidationFailure, + testing::HasSubstr("the ranks of the inputs have to be compatible")); +} diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/rope.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/rope.hpp index 8f4ae2c66334ee..d7933e2180fe6f 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/rope.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/rope.hpp @@ -26,7 +26,11 @@ struct rope : public primitive_base { size_t gather_rank = 0) : primitive_base(id, inputs), config(config), - gather_rank(gather_rank) {} + gather_rank(gather_rank) { + OPENVINO_ASSERT((!config.support_2d_rope + || (config.support_2d_rope && config.is_chatglm)), + "2D RoPE is currently only supported in Chatglm!"); + } RoPE::Config config; size_t gather_rank = 0; @@ -38,6 +42,7 @@ struct rope : public primitive_base { seed = hash_combine(seed, config.head_size); seed = hash_combine(seed, config.input_trans0213); seed = hash_combine(seed, config.is_chatglm); + seed = hash_combine(seed, config.support_2d_rope); seed = hash_combine(seed, config.is_interleaved); seed = hash_combine(seed, config.is_qwen); seed = hash_combine(seed, config.rotary_ndims); @@ -58,6 +63,7 @@ struct rope : public primitive_base { config.head_size == rhs_casted.config.head_size && config.input_trans0213 == rhs_casted.config.input_trans0213 && config.is_chatglm == rhs_casted.config.is_chatglm && + config.support_2d_rope == rhs_casted.config.support_2d_rope && config.is_interleaved == rhs_casted.config.is_interleaved && config.is_qwen == rhs_casted.config.is_qwen && config.rotary_ndims == rhs_casted.config.rotary_ndims && @@ -73,6 +79,7 @@ struct rope : public primitive_base { ob << config.head_size; ob << config.input_trans0213; ob << config.is_chatglm; + ob << config.support_2d_rope; ob << config.is_interleaved; ob << config.is_qwen; ob << config.rotary_ndims; @@ -88,6 +95,7 @@ struct rope : public primitive_base { ib >> config.head_size; ib >> config.input_trans0213; ib >> config.is_chatglm; + ib >> config.support_2d_rope; ib >> config.is_interleaved; ib >> config.is_qwen; ib >> config.rotary_ndims; diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp index f869feba4a5334..049e7a29cb9c23 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp @@ -48,8 +48,8 @@ struct memory { virtual ~memory() = default; virtual void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) = 0; virtual void unlock(const stream& stream) = 0; - virtual event::ptr fill(stream& stream, unsigned char pattern) = 0; - virtual event::ptr fill(stream& stream) = 0; + virtual event::ptr fill(stream& stream, unsigned char pattern, bool blocking = true) = 0; + virtual event::ptr 
fill(stream& stream, bool blocking = true) = 0; // only supports gpu_usm virtual void* buffer_ptr() const { return nullptr; } @@ -147,8 +147,8 @@ struct simple_attached_memory : memory { void* lock(const stream& /* stream */, mem_lock_type /* type */) override { return _pointer; } void unlock(const stream& /* stream */) override {} - event::ptr fill(stream& /* stream */, unsigned char) override { return nullptr; } - event::ptr fill(stream& /* stream */) override { return nullptr; } + event::ptr fill(stream& /* stream */, unsigned char, bool) override { return nullptr; } + event::ptr fill(stream& /* stream */, bool) override { return nullptr; } shared_mem_params get_internal_params() const override { return { shared_mem_type::shared_mem_empty, nullptr, nullptr, nullptr, #ifdef _WIN32 nullptr, diff --git a/src/plugins/intel_gpu/src/graph/crop.cpp b/src/plugins/intel_gpu/src/graph/crop.cpp index e17cc3e5552849..e3ff36ceae38a5 100644 --- a/src/plugins/intel_gpu/src/graph/crop.cpp +++ b/src/plugins/intel_gpu/src/graph/crop.cpp @@ -50,7 +50,7 @@ std::vector crop_inst::calc_output_layouts(const crop_node& /*node*/, co std::vector input_shapes = { impl_param.input_layouts[0].get(), }; - for (size_t i = 1; i < impl_param.input_layouts.size(); ++i) { + for (size_t i = 1; i < desc->input.size(); ++i) { input_shapes.push_back(impl_param.input_layouts[i].get()); } int64_t axis = desc->axis; diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index b7017c414c505f..7bdbc53ad54d16 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -461,7 +461,7 @@ bool crop_in_place_optimization::match(const program_node& node, return false; // if the node is marked as network output, prevent optimizations which would affect a form of its output, // unless debug flag is set - if (node.is_output() || crop_params.fused_desc.size() > 0 || node.is_in_shape_of_subgraph()) + if (node.is_output() || crop_params.has_fused_primitives() || node.is_in_shape_of_subgraph()) return false; const auto& crop_layout = crop_params.get_output_layout(); @@ -547,6 +547,9 @@ bool crop_in_place_optimization::optimize(crop_node& node) { auto input_layout = node.get_input_layout(0); auto crop_params = node.get_kernel_impl_params(); + if (crop_params->has_fused_primitives()) + return false; + // Regular crop // crop input buffer // |___________data____________| diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index b42ab89eafd61a..5e8380f35dcb93 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -736,6 +736,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { should_fuse |= input.is_type(); + should_fuse |= input.is_type(); + bool legacy_fusion = activation_node.get_dependencies().size() == 1 && !input.can_be_optimized() && !activation_node.is_constant() && @@ -920,7 +922,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { (parents[i].first->is_type()) || (parents[i].first->is_type() && reduce_supports_fusings(parents[i].first->as())) || - (parents[i].first->is_type()); + (parents[i].first->is_type()) || + (parents[i].first->is_type()); } // 
Disable fusion to a node on constant path when second input is in data flow @@ -1045,6 +1048,15 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { std::swap(fused_idx, peer_idx); } + // Avoid fusing with GEMM from the LoRA pattern, that can be optimized in case of empty adapters + if (parents[fused_idx].first->is_type()) { + if (parents[peer_idx].first->is_type() || + (parents[peer_idx].first->is_type() && + parents[peer_idx].first->get_dependency(0).is_type())) { + std::swap(fused_idx, peer_idx); + } + } + auto fused_node = parents[fused_idx].first; auto peer_node = parents[peer_idx].first; if (lo.get_optimization_attributes().use_onednn_impls && lo.is_primitive_implemented_for_onednn(*fused_node)) { diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index dff6b16d30a2ad..28ee84c4a4ec02 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -434,7 +434,7 @@ void remove_redundant_reorders::run(program& p) { (input.is_type() || input.is_type() || input.is_type() || input.is_type() || input.is_type() || input.is_type() || input.is_type() || input.is_type() || input.is_type() || - input.is_type() || input.is_type()) && !input.is_constant(); if (!same_data_type && !allowed_dt_conversion_fuse) continue; diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp index 6c16618ac816d0..5692b6037a09e0 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp @@ -59,7 +59,7 @@ struct read_value_impl : public typed_primitive_impl { if (instance.get_impl_params()->input_layouts.size() > 0) { variable.get_memory()->copy_from(stream, instance.dep_memory(0), true); } else { - variable.get_memory()->fill(stream, 0); + variable.get_memory()->fill(stream); } } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp index 41934847f899de..174ea1fa1767a9 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp @@ -154,6 +154,13 @@ struct gemm_impl : multi_stage_primitive { } event::ptr execute_impl(const std::vector& events, gemm_inst& instance) override { + if (instance.get_input_layout(0).count() == 0 || + instance.get_input_layout(1).count() == 0) { + stream& stream = instance.get_network().get_stream(); + stream.enqueue_barrier(); + return instance.output_memory_ptr()->fill(stream, false); + } + if (need_indirect_load(instance)) return execute_stage(events, instance, indirect_gemm); else diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp index d8f0e45c25146f..8c08afc0428432 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp @@ -37,7 +37,7 @@ struct count_nonzero_impl : typed_primitive_impl_ocl { event::ptr execute_impl(const std::vector& events, count_nonzero_inst& instance) override { if (instance.get_impl_params()->input_layouts[0].count() == 0) { // set count of non-zero elements to 0 in case if input tensor is empty to have correct memory alloc for gather_nonzero - return 
instance.output_memory(0).fill(instance.get_network().get_stream(), 0); + return instance.output_memory(0).fill(instance.get_network().get_stream()); } else { return parent::execute_impl(events, instance); } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp index f65768b8e6eb20..7764b7b0964d1c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp @@ -53,6 +53,7 @@ struct rope_impl : typed_primitive_impl_ocl { params.is_qwen = primitive->config.is_qwen; params.is_chatglm = primitive->config.is_chatglm; + params.support_2d_rope = primitive->config.support_2d_rope; params.transposed_input = primitive->config.input_trans0213; for (size_t i = 1; i < impl_param.input_layouts.size(); ++i) { diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 13634b49fd9d96..095dc5fd45fa52 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -655,7 +655,7 @@ event::ptr primitive_inst::realloc_if_needed() { } } - // Clear out memory if if was previously reused, but now primitive can't be optimized + // Clear out memory if was previously reused, but now primitive can't be optimized if (!_node->is_type() && (_node->is_runtime_skippable() || _node->is_type())) { if (can_be_optimized()) { _max_output_layout_count = _deps[0].first->_max_output_layout_count; @@ -663,7 +663,7 @@ event::ptr primitive_inst::realloc_if_needed() { return ev; } else if (_outputs[0] && dep_memory_ptr(0) && _network.get_engine().is_the_same_buffer(dep_memory(0), output_memory(0))) { - // Clear out memory if if was previously reused, but now primitive can't be optimized + // Clear out memory if was previously reused, but now primitive can't be optimized _outputs[0] = nullptr; _max_output_layout_count[0] = 0; } @@ -1527,7 +1527,7 @@ event::ptr primitive_inst::execute(const std::vector& events) { } if (can_skip_execution) { - auto ev = get_network().get_stream().create_user_event(true); + auto ev = get_network().get_stream().aggregate_events(events); update_shape_done_by_other = false; // reset return ev; } diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 03cc8df8b4338c..3a3793e8ad764d 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -1898,6 +1898,7 @@ void program::load(cldnn::BinaryInputBuffer& ib) { _loaded_from_cache = true; processing_order.load(ib, *this); + set_layout_optimizer_attributes(*_layout_optimizer); { auto& kernels_cache = get_kernels_cache(); diff --git a/src/plugins/intel_gpu/src/graph/rope.cpp b/src/plugins/intel_gpu/src/graph/rope.cpp index ea904916d4cf41..e168626f8d69a2 100644 --- a/src/plugins/intel_gpu/src/graph/rope.cpp +++ b/src/plugins/intel_gpu/src/graph/rope.cpp @@ -30,11 +30,24 @@ std::vector rope_inst::calc_output_layouts(rope_node const& node, kernel ShapeType output_shape = input0_shape; - if (desc->config.is_qwen || desc->config.is_chatglm) { + if (desc->config.is_qwen) { output_shape = { input0_shape[0], input0_shape[1], ov::Dimension(desc->config.head_cnt), ov::Dimension(desc->config.head_size) }; + } else if (desc->config.is_chatglm) { + if (desc->config.support_2d_rope) { + // input0_shape = [batch_size, seq_length] + output_shape = { input0_shape[0], + ov::Dimension(desc->config.head_cnt), + input0_shape[1], + 
ov::Dimension(desc->config.head_size) }; + } else { + output_shape = { input0_shape[0], + input0_shape[1], + ov::Dimension(desc->config.head_cnt), + ov::Dimension(desc->config.head_size) }; + } } else { auto input_slice_size = desc->config.slice_stop - desc->config.slice_start; if (input_slice_size > 0) { @@ -63,6 +76,7 @@ std::string rope_inst::to_string(rope_node const& node) { rope_info.add("head_size", desc->config.head_size); rope_info.add("input_trans0213", desc->config.input_trans0213); rope_info.add("is_chatglm", desc->config.is_chatglm); + rope_info.add("support_2d_rope", desc->config.support_2d_rope); rope_info.add("is_interleaved", desc->config.is_interleaved); rope_info.add("is_qwen", desc->config.is_qwen); rope_info.add("rotary_ndims", desc->config.rotary_ndims); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl index 36d4306b59ba79..38066b4461def4 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl @@ -11,12 +11,22 @@ KERNEL(rope_ref)( const __global INPUT1_TYPE* cos_sin, __global OUTPUT_TYPE* output) { +#ifdef SUPPORT_2D_ROPE + const uint p = get_global_id(0) / HEAD_COUNT; + const uint h = get_global_id(0) % HEAD_COUNT; + const uint b = get_global_id(1);//sequence length + const uint rf = get_global_id(2);//max(HALF_ROTARY_NDIMS, HEAD_SIZE - ROTARY_NDIMS) + uint output_idx = OUTPUT_GET_INDEX(p, h, b, 0); +#else const uint p = get_global_id(0); const uint b = get_global_id(1); const uint h = (uint)get_global_id(2) % HEAD_COUNT; const uint rf = (uint)get_global_id(2) / HEAD_COUNT; + uint output_idx = OUTPUT_GET_INDEX(p, b, h, 0); +#endif + uint r = rf < HALF_ROTARY_NDIMS ? rf * 2 : 0; - uint f = rf < HEAD_SIZE - ROTARY_NDIMS ? rf : 0; + uint f = rf < HEAD_SIZE - ROTARY_NDIMS ? rf * 2 : 0; #ifdef ENABLE_SLICE uint input_idx = GET_DATA_INDEX(SLICED_INPUT0, p, b, h * HEAD_SIZE, 0); @@ -30,19 +40,18 @@ KERNEL(rope_ref)( uint cos_sin_b = b < INPUT1_FEATURE_NUM ? 
b : 0; uint cos_sin_idx = INPUT1_GET_INDEX(cos_sin_p, cos_sin_b, 0, 0); - uint output_idx = OUTPUT_GET_INDEX(p, b, h, 0); - - INPUT1_TYPE cosv = cos_sin[cos_sin_idx + r]; - INPUT1_TYPE sinv = cos_sin[cos_sin_idx + r + 1]; + float cosv = convert_float(cos_sin[cos_sin_idx + r]); + float sinv = convert_float(cos_sin[cos_sin_idx + r + 1]); - INPUT0_TYPE in1 = input[input_idx + r]; - INPUT0_TYPE in2 = input[input_idx + r + 1]; + float in1 = convert_float(input[input_idx + r]); + float in2 = convert_float(input[input_idx + r + 1]); - output[output_idx + r] = cosv * in1 - sinv * in2; - output[output_idx + r + 1] = sinv * in1 + cosv * in2; + output[output_idx + r] = TO_OUTPUT_TYPE(cosv * in1 - sinv * in2); + output[output_idx + r + 1] = TO_OUTPUT_TYPE(sinv * in1 + cosv * in2); #ifdef ENABLE_IO_COPY output[output_idx + ROTARY_NDIMS + f] = input[input_idx + ROTARY_NDIMS + f]; + output[output_idx + ROTARY_NDIMS + f + 1] = input[input_idx + ROTARY_NDIMS + f + 1]; #endif } #endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/slice_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/slice_ref.cl index c9e2c0688e1968..ba36ee859412ec 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/slice_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/slice_ref.cl @@ -24,7 +24,7 @@ out_name[4] = in_prefix##_VAL4; #endif -KERNEL(slice_ref)(OPTIONAL_SHAPE_INFO_ARG +KERNEL(slice_ref)(OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* restrict input, START_BUFFER STEP_BUFFER diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp index a9e0818aeae2f5..a48632f6c45509 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp @@ -70,6 +70,9 @@ JitConstants RoPEKernelBase::GetJitConstants(const rope_params& params, RoPEKern if (params.is_qwen) { jit.AddConstant(MakeJitConstant("QWEN", true)); } else if (params.is_chatglm) { + if (params.support_2d_rope) { + jit.AddConstant(MakeJitConstant("SUPPORT_2D_ROPE", true)); + } jit.AddConstant(MakeJitConstant("CHATGLM", true)); } else { jit.AddConstant(MakeJitConstant("RotateHalf", true)); @@ -85,10 +88,22 @@ RoPEKernelBase::DispatchData RoPEKernelBase::SetDefault(const rope_params& param std::vector> dims_by_gws = {{ Tensor::DataChannelName::BATCH }, { Tensor::DataChannelName::FEATURE }, { Tensor::DataChannelName::Y, Tensor::DataChannelName::X }}; - if (params.is_chatglm || params.is_qwen) { + if (params.is_qwen) { dispatchData.gws = {input.Batch().v, input.Feature().v, params.head_cnt * std::max(params.rotary_ndims / 2ul, params.head_size - params.rotary_ndims)}; + } else if (params.is_chatglm) { + if (params.support_2d_rope) { + // input [batch_size, seq_length] + // output [batch_size, head_count, seq_length, half_rotary_ndims] + dispatchData.gws = {input.Batch().v * params.head_cnt, + input.Feature().v, + params.rotary_ndims / 2ul}; + } else { + dispatchData.gws = {input.Batch().v, + input.Feature().v, + params.head_cnt * (params.rotary_ndims / 2ul)}; + } } else { dispatchData.gws = {output.Batch().v, output.Feature().v, diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.h index 5d55fd082765e8..472131eba5d82f 100644 --- 
a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.h @@ -24,6 +24,7 @@ struct rope_params : public base_params { bool is_qwen = false; bool is_chatglm = false; + bool support_2d_rope = false; bool transposed_input = false; }; diff --git a/src/plugins/intel_gpu/src/plugin/ops/variable.cpp b/src/plugins/intel_gpu/src/plugin/ops/variable.cpp index 9d7d6854009316..d655e297e4a2c6 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/variable.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/variable.cpp @@ -45,6 +45,12 @@ void CreateVariableAccessPrimitive(ProgramBuilder &p, const std::shared_ptr op) { + return ov::is_type(op) || + ov::is_type(op) || + ov::is_type(op); +} + void CreateReadValueOp(ProgramBuilder& p, const std::shared_ptr& op) { validate_inputs_count(op, {0, 1}); CreateVariableAccessPrimitive(p, op, op->get_variable_id()); @@ -57,6 +63,9 @@ void CreateReadValueOp(ProgramBuilder& p, const std::shared_ptr& op) { validate_inputs_count(op, {1}); + if (IsReadValueOp(op->get_input_node_shared_ptr(0))) { + return; + } CreateVariableAccessPrimitive(p, op, op->get_variable_id()); } @@ -67,6 +76,9 @@ void CreateReadValueOp(ProgramBuilder& p, const std::shared_ptr& op) { validate_inputs_count(op, {1}); + if (IsReadValueOp(op->get_input_node_shared_ptr(0))) { + return; + } CreateVariableAccessPrimitive(p, op, op->get_variable_id()); } diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 7ee587e612ad3d..4ea7851b3f8c58 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -575,7 +575,8 @@ std::vector Plugin::get_supported_internal_properties() const ov::PropertyName{ov::internal::exclusive_async_requests.name(), ov::PropertyMutability::RW}, ov::PropertyName{ov::internal::compiled_model_runtime_properties.name(), ov::PropertyMutability::RO}, ov::PropertyName{ov::internal::compiled_model_runtime_properties_supported.name(), ov::PropertyMutability::RO}, - ov::PropertyName{ov::internal::query_model_ratio.name(), PropertyMutability::RW}}; + ov::PropertyName{ov::internal::query_model_ratio.name(), PropertyMutability::RW}, + ov::PropertyName{ov::internal::caching_with_mmap.name(), PropertyMutability::RO}}; return supported_internal_properties; } diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 40c7ab48c486cb..f173e378fca3f9 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -862,7 +862,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { const size_t zp_pad_size = device_info.supports_immad ? 
16 : 32; manager.register_pass(zp_pad_size, device_info.supports_immad); - manager.register_pass(); + manager.register_pass(true); pass_config->disable(); pass_config->disable(); pass_config->disable(); diff --git a/src/plugins/intel_gpu/src/plugin/variable_state.cpp b/src/plugins/intel_gpu/src/plugin/variable_state.cpp index b24ddbd314a0cd..6b1c8d0cfc993f 100644 --- a/src/plugins/intel_gpu/src/plugin/variable_state.cpp +++ b/src/plugins/intel_gpu/src/plugin/variable_state.cpp @@ -70,6 +70,11 @@ void VariableState::set_state(const ov::SoPtr& state) { m_layout.set_partial_shape(src_shape); update_device_buffer(); + if (actual_size == 0) { + set(); + return; + } + // check whether the src tensor is padded std::vector src_stride_no_pad(src_rank, 1); std::vector upper_pad(src_rank, 0); diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp index a2ddc7dd2a4dff..f7e5ada9e24ef1 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp @@ -91,15 +91,15 @@ void gpu_buffer::unlock(const stream& stream) { } } -event::ptr gpu_buffer::fill(stream& stream) { +event::ptr gpu_buffer::fill(stream& stream, bool blocking) { if (_bytes_count == 0) { GPU_DEBUG_TRACE_DETAIL << "Skip EnqueueMemcpy for 0 size tensor" << std::endl; return stream.create_user_event(true); } - return fill(stream, 0); + return fill(stream, 0, blocking); } -event::ptr gpu_buffer::fill(stream& stream, unsigned char pattern) { +event::ptr gpu_buffer::fill(stream& stream, unsigned char pattern, bool blocking) { if (_bytes_count == 0) { GPU_DEBUG_TRACE_DETAIL << "Skip EnqueueMemcpy for 0 size tensor" << std::endl; return stream.create_user_event(true); @@ -109,6 +109,9 @@ event::ptr gpu_buffer::fill(stream& stream, unsigned char pattern) { cl::Event& ev_ocl = downcast(ev.get())->get(); try { cl_stream.get_cl_queue().enqueueFillBuffer(_buffer, pattern, 0, size(), nullptr, &ev_ocl); + if (blocking) { + ev_ocl.wait(); + } } catch (cl::Error const& err) { OPENVINO_THROW(OCL_ERR_MSG_FMT(err)); } @@ -272,15 +275,15 @@ gpu_image2d::gpu_image2d(ocl_engine* engine, _slice_pitch = _buffer.getImageInfo(); } -event::ptr gpu_image2d::fill(stream& stream) { +event::ptr gpu_image2d::fill(stream& stream, bool blocking) { if (_bytes_count == 0) { GPU_DEBUG_TRACE_DETAIL << "Skip EnqueueMemcpy for 0 size tensor" << std::endl; return stream.create_user_event(true); } - return fill(stream, 0); + return fill(stream, 0, blocking); } -event::ptr gpu_image2d::fill(stream& stream, unsigned char pattern) { +event::ptr gpu_image2d::fill(stream& stream, unsigned char pattern, bool blocking) { if (_bytes_count == 0) { GPU_DEBUG_TRACE_DETAIL << "Skip EnqueueMemcpy for 0 size tensor" << std::endl; return stream.create_user_event(true); @@ -291,6 +294,9 @@ event::ptr gpu_image2d::fill(stream& stream, unsigned char pattern) { cl_uint4 pattern_uint4 = {{pattern, pattern, pattern, pattern}}; try { cl_stream.get_cl_queue().enqueueFillImage(_buffer, pattern_uint4, {0, 0, 0}, {_width, _height, 1}, 0, &ev_ocl); + if (blocking) { + ev_ocl.wait(); + } } catch (cl::Error const& err) { OPENVINO_THROW(OCL_ERR_MSG_FMT(err)); } @@ -509,7 +515,7 @@ void gpu_usm::unlock(const stream& /* stream */) { } } -event::ptr gpu_usm::fill(stream& stream, unsigned char pattern) { +event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, bool blocking) { if (_bytes_count == 0) { GPU_DEBUG_TRACE_DETAIL << "Skip gpu_usm::fill for 0 size tensor" << std::endl; 
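// Illustrative sketch (editorial aside, not part of this patch): fill() now takes
// a `blocking` flag, defaulting to true (the enqueued OpenCL fill is waited on
// immediately), while a caller may pass false and synchronise on the returned
// event itself - as the gemm impl earlier in this diff does for empty inputs.
// The helper below is hypothetical; the types and the fill() call mirror the
// cldnn runtime API touched by this change.
#include "intel_gpu/runtime/memory.hpp"
#include "intel_gpu/runtime/stream.hpp"

static cldnn::event::ptr zero_output_async(cldnn::memory::ptr out, cldnn::stream& stream) {
    // Non-blocking: the fill is enqueued, no wait happens here.
    cldnn::event::ptr ev = out->fill(stream, /*blocking=*/false);
    // ... independent work can be enqueued while the fill is in flight ...
    return ev;  // the caller waits on (or chains) this event when the zeroed buffer is needed
}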
return stream.create_user_event(true); @@ -517,14 +523,12 @@ event::ptr gpu_usm::fill(stream& stream, unsigned char pattern) { auto& cl_stream = downcast(stream); auto ev = stream.create_base_event(); cl::Event& ev_ocl = downcast(ev.get())->get(); - // enqueueFillUsm call will never finish. Driver bug? Uncomment when fixed. Some older drivers doesn't support enqueueFillUsm call at all. - // cl_stream.get_usm_helper().enqueue_fill_mem(cl_stream.get_cl_queue(), _buffer.get(), pattern, _bytes_count, nullptr, &ev_ocl) - // Workarounded with enqeue_memcopy. ToDo: Remove below code. Uncomment above. - std::vector temp_buffer(_bytes_count, pattern); - // TODO: Do we really need blocking call here? Non-blocking one causes accuracy issues right now, but hopefully it can be fixed in more performant way. - const bool blocking = true; try { - cl_stream.get_usm_helper().enqueue_memcpy(cl_stream.get_cl_queue(), _buffer.get(), temp_buffer.data(), _bytes_count, blocking, nullptr, &ev_ocl); + cl_stream.get_usm_helper().enqueue_fill_mem( + cl_stream.get_cl_queue(), _buffer.get(), static_cast(&pattern), sizeof(unsigned char), _bytes_count, nullptr, &ev_ocl); + if (blocking) { + ev_ocl.wait(); + } } catch (cl::Error const& err) { OPENVINO_THROW(OCL_ERR_MSG_FMT(err)); } @@ -532,7 +536,7 @@ event::ptr gpu_usm::fill(stream& stream, unsigned char pattern) { return ev; } -event::ptr gpu_usm::fill(stream& stream) { +event::ptr gpu_usm::fill(stream& stream, bool blocking) { // event::ptr ev{ new base_event(_context), false }; // cl::Event ev_ocl = downcast(ev.get())->get(); // cl::usm::enqueue_set_mem(cl_stream.get_cl_queue(), _buffer.get(), 0, _bytes_count, nullptr, &ev_ocl); @@ -543,7 +547,7 @@ event::ptr gpu_usm::fill(stream& stream) { GPU_DEBUG_TRACE_DETAIL << "Skip EnqueueMemcpy for 0 size tensor" << std::endl; return stream.create_user_event(true); } - return fill(stream, 0); + return fill(stream, 0, blocking); } event::ptr gpu_usm::copy_from(stream& stream, const void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) { diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp index e2a68537cdc69e..e37518de3982a8 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp @@ -32,8 +32,8 @@ struct gpu_buffer : public lockable_gpu_mem, public memory { void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) override; void unlock(const stream& stream) override; - event::ptr fill(stream& stream, unsigned char pattern) override; - event::ptr fill(stream& stream) override; + event::ptr fill(stream& stream, unsigned char pattern, bool blocking = true) override; + event::ptr fill(stream& stream, bool blocking = true) override; shared_mem_params get_internal_params() const override; const cl::Buffer& get_buffer() const { assert(0 == _lock_count); @@ -58,8 +58,8 @@ struct gpu_image2d : public lockable_gpu_mem, public memory { void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) override; void unlock(const stream& stream) override; - event::ptr fill(stream& stream, unsigned char pattern) override; - event::ptr fill(stream& stream) override; + event::ptr fill(stream& stream, unsigned char pattern, bool blocking = true) override; + event::ptr fill(stream& stream, bool blocking = true) override; shared_mem_params get_internal_params() const override; const cl::Image2D& get_buffer() const { assert(0 == _lock_count); @@ 
-112,8 +112,8 @@ struct gpu_usm : public lockable_gpu_mem, public memory { cl::UsmMemory& get_buffer() { return _buffer; } void* buffer_ptr() const override { return _buffer.get(); } - event::ptr fill(stream& stream, unsigned char pattern) override; - event::ptr fill(stream& stream) override; + event::ptr fill(stream& stream, unsigned char pattern, bool blocking = true) override; + event::ptr fill(stream& stream, bool blocking = true) override; shared_mem_params get_internal_params() const override; event::ptr copy_from(stream& stream, const void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) override; diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp index 9565036f7b452d..741014b461e7f0 100644 --- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp +++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp @@ -44,5 +44,11 @@ INSTANTIATE_TEST_SUITE_P(smoke_RoPETestLlama2, ::testing::Values(ov::test::utils::DEVICE_GPU), RoPETestLlama2Slice::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_RoPETestChatGLM, + RoPETestChatGLM2DRoPEStridedSlice, + ::testing::Values(ov::test::utils::DEVICE_GPU), + RoPETestChatGLM2DRoPEStridedSlice::getTestCaseName); + + } // namespace test } // namespace ov diff --git a/src/plugins/intel_gpu/tests/unit/fusions/eltwise_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/eltwise_fusion_test.cpp index d4c50ec84ac78a..5d259a1a1862fc 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/eltwise_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/fusions/eltwise_fusion_test.cpp @@ -38,8 +38,10 @@ class EltwiseFusingTest : public ::BaseFusingTest { network network_fused(this->engine, this->topology_fused, cfg_fused); auto inputs = network_fused.get_input_ids(); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); + if (std::find(inputs.begin(), inputs.end(), "input") != inputs.end()) { + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + } if (std::find(inputs.begin(), inputs.end(), "input2") != inputs.end()) { network_fused.set_input_data("input2", input_prim2); network_not_fused.set_input_data("input2", input_prim2); @@ -699,3 +701,27 @@ TEST_P(eltwise_fusing_reorders, reorders_for_data_type) { INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_fusing_reorders, ::testing::ValuesIn(std::vector{ eltwise_test_params{ { 1, 16, 16, 2 }, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx, eltwise_mode::max, 4, 6 }, })); + +class eltwise_with_constant_input : public EltwiseFusingTest {}; +TEST_P(eltwise_with_constant_input, basic) { + auto p = GetParam(); + create_topologies(data("eltwise_data", get_mem(get_input_layout2(p), -10, 10)), + data("eltwise_data1", get_mem(get_input_layout2(p), -10, 10)), + eltwise("eltwise", {input_info("eltwise_data"), input_info("eltwise_data1")}, p.mode, p.default_type), + reorder("out", + input_info("eltwise"), + p.default_format, + data_types::f32, + std::vector(), + cldnn::reorder_mean_mode::subtract, + cldnn::padding(), + true) + ); + + tolerance = default_tolerance(p.input_type); + execute(p, true); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_with_constant_input, ::testing::ValuesIn(std::vector{ + 
eltwise_test_params{ CASE_ELTWISE_FP16_1, 0, 0}, +})); diff --git a/src/plugins/intel_gpu/tests/unit/fusions/fusion_test_common.hpp b/src/plugins/intel_gpu/tests/unit/fusions/fusion_test_common.hpp index a590fb9299a777..eb0f63c651e50d 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/fusion_test_common.hpp +++ b/src/plugins/intel_gpu/tests/unit/fusions/fusion_test_common.hpp @@ -81,8 +81,9 @@ class BaseFusingTest : public ::testing::TestWithParam { ASSERT_EQ(outputs_ref.size(), outputs_fused.size()); ASSERT_EQ(outputs_ref.size(), size_t(1)); + std::vector val_opt; auto val_ref = get_output_values_to_float(not_fused, outputs_ref.begin()->second); - auto val_opt = get_output_values_to_float(fused, outputs_fused.begin()->second); + ASSERT_NO_THROW(val_opt = get_output_values_to_float(fused, outputs_fused.begin()->second)); ASSERT_EQ(val_ref.size(), val_opt.size()); for (size_t i = 0; i < val_ref.size(); i++) { ASSERT_NEAR(val_ref[i], val_opt[i], tolerance) diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/non_zero_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/non_zero_gpu_test.cpp index 37a1ba8b982414..80122193265ebc 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/non_zero_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/non_zero_gpu_test.cpp @@ -556,7 +556,7 @@ TEST(non_zero_gpu, empty_input) { // Put some value into out buffer to ensure that it's non empty // That is needed to ensure that implementation correctly handles the cases when input tensor is empty and set count non zero to 0 - count_nonzero_inst->output_memory(0).fill(engine.get_service_stream(), 1); + count_nonzero_inst->output_memory(0).fill(engine.get_service_stream(), 1, true); engine.get_service_stream().finish(); auto count_nonzero_impl = count_nonzero_inst->get_impl(); diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp index 65ec475df6b986..fef9470545482a 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp @@ -30,7 +30,7 @@ void registerNPUWOptions(OptionsDesc& desc); DEFINE_OPT(NPU_USE_NPUW, bool, false, use_npuw, CompileTime); DEFINE_OPT(NPUW_DEVICES, std::string, "NPU,CPU", npuw::devices, CompileTime); DEFINE_OPT(NPUW_SUBMODEL_DEVICE, std::string, "", npuw::submodel_device, CompileTime); -DEFINE_OPT(NPUW_ONLINE_PIPELINE, std::string, "REP", npuw::partitioning::online::pipeline, CompileTime); +DEFINE_OPT(NPUW_ONLINE_PIPELINE, std::string, "REG", npuw::partitioning::online::pipeline, CompileTime); DEFINE_OPT(NPUW_ONLINE_AVOID, std::string, "", npuw::partitioning::online::avoid, CompileTime); DEFINE_OPT(NPUW_ONLINE_ISOLATE, std::string, "", npuw::partitioning::online::isolate, CompileTime); DEFINE_OPT(NPUW_ONLINE_NO_FOLD, std::string, "", npuw::partitioning::online::nofold, CompileTime); diff --git a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp index 31fa52c3878598..059977ee47a063 100644 --- a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp @@ -67,8 +67,8 @@ namespace online { * @brief * Type: std::string. * Specify which partitioning pipeline to run. - * Possible values: "NONE", "INIT", "JUST", "REP", "COMPUTE". - * Default value: "REP". + * Possible values: "NONE", "INIT", "JUST", "REP", "REG", "COMPUTE". 
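// Illustrative sketch (editorial aside, not part of this patch): NPUW_ONLINE_PIPELINE
// is a compile-time string option and its default moves from "REP" to "REG" here.
// A user who wants the previous partitioning back would request it explicitly,
// roughly as below; the property keys follow this header, while the model path,
// device name and the "YES" spelling for the boolean switch are assumptions.
//
//   ov::Core core;
//   auto model = core.read_model("model.xml");
//   auto compiled = core.compile_model(model, "NPU",
//                                      {{"NPU_USE_NPUW", "YES"},
//                                       {"NPUW_ONLINE_PIPELINE", "REP"}});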
+ * Default value: "REG". */ static constexpr ov::Property pipeline{"NPUW_ONLINE_PIPELINE"}; diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 2fe90eb82c41bb..a312a806cac4bc 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -516,11 +516,6 @@ std::string ov::npuw::CompiledModel::global_mem_device() const { } std::string ov::npuw::CompiledModel::funcall_mem_device(const std::size_t idx) const { - // FIXME: currently we allocate intermediate tensors for EVERY submodel. - // It's not feasible to allocate them in L0 due to high memory consumption. - // Until we make such memory reusable, hard-coding those tensors to CPU. - return "CPU"; - // Force globally set device if set const std::string device_alloc = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK_ALLOC>(); if (!device_alloc.empty()) { diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp index 4152d08275ba6d..038c1bb176b029 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp @@ -46,6 +46,8 @@ class CompiledModel : public ov::ICompiledModel { // FIXME: This class has many friends.. friend class IBaseInferRequest; friend class JustInferRequest; + friend class MemAccessSim; + friend class FuncMemMgr; bool compile_for_success(std::size_t id); bool compile_for_device(std::size_t id, const std::string& device_to_try); diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index fbbabf083bccd8..c4e2c3ee98b676 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -20,8 +20,173 @@ #include "util.hpp" #include "weights_bank.hpp" +ov::npuw::MemAccessSim::MemAccessSim(const std::shared_ptr& compiled_model) { + LOG_VERB("Running memory access simulation..."); + LOG_BLOCK(); + + // Initialize the read list + m_read_list.resize(compiled_model->m_compiled_submodels.size()); + + // Initialize read counters for tensors in the graph: + // 1. Interconnect + for (const auto& kvp : compiled_model->m_submodels_input_to_prev_output) { + const auto& read_to = kvp.first; // who reads + const auto& read_from = kvp.second; // reads what + + if (read_to == CompiledModel::NO_LINK || read_from == CompiledModel::NO_LINK) { + continue; + } + + // Record # of reads for this particular Source + m_remaining_reads[read_from]++; + + // Record a read request for this particular Subgraph (who reads the Source) + m_read_list[read_to.first].push_back(read_from); + } + // 2. 
Global model's outputs + for (auto&& read_from : compiled_model->m_outputs_to_submodels_outputs) { + m_remaining_reads[read_from]++; + } + + LOG_VERB("Done"); +} + +const ov::npuw::MemAccessSim::ReadList& ov::npuw::MemAccessSim::read_list(std::size_t idx) const { + return m_read_list.at(idx); +} + +std::size_t ov::npuw::MemAccessSim::remaining_reads(const LinkFrom& from) { + return m_remaining_reads.at(from); +} + +void ov::npuw::MemAccessSim::register_read(const LinkFrom& from) { + m_remaining_reads.at(from)--; +} + +ov::npuw::FuncMemMgr::FuncMemMgr(const std::shared_ptr& compiled_model) + : m_sim(compiled_model), + m_model(compiled_model) {} + +void ov::npuw::FuncMemMgr::set_alloc(AllocFcn&& fcn) { + m_alloc = std::move(fcn); +} + +void ov::npuw::FuncMemMgr::assign_memory() { + LOG_VERB("Assigning function memory..."); + LOG_BLOCK(); + + const auto num_submodels = m_model->m_compiled_submodels.size(); + + // Walk over the subgraphs, pre-allocate and pre-assign tensors to the subgraphs + // outputs. + for (std::size_t idx = 0u; idx < num_submodels; idx++) { + LOG_VERB("Process Subgraph[" << idx << "]"); + LOG_BLOCK(); + const auto& comp_model_desc = m_model->m_compiled_submodels[idx]; + if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) { + // no model & no funcall - optimized out, do nothing + continue; + } + + // Simulate subgraph execution: poll its input list first + const auto& read_list = m_sim.read_list(idx); + + // Now, get the outputs for the subgraph. If it is "regular", there's + // nothing to do - this subgraph owns its outputs on its own. + // If it is a function, though - look up in the function's memory storage. + if (comp_model_desc.replaced_by) { + const auto real_idx = comp_model_desc.replaced_by.value(); + const auto& proto_comp_model_desc = m_model->m_compiled_submodels[real_idx]; + + const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size(); + for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) { + const LinkFrom this_out = LinkFrom{idx, out_idx}; + assign(this_out); + } + } + + // Here happens the imaginary execution... Hocus pocus, done - that's a + // simulation after all + // After the execution, mark that the read_list was read. + for (auto&& from : read_list) { + m_sim.register_read(from); + } + LOG_VERB("Done"); + } + + // Report memory residency + for (auto&& m : m_memory) { + LOG_VERB("Function " << m.first.first << "/out port " << m.first.second << " : maximum memory residency " + << m.second.size() << " tensor(s)"); + } + + LOG_VERB("Done"); +} + +void ov::npuw::FuncMemMgr::assign(const LinkFrom& from) { + // This method is the center of the function memory management. + // The logic is simple: + // - Look for an output tensor to reuse + // - If there's one, assign it to this allocation + // - If there's none, allocate a new tensor + // - How a tensor to reuse is picked: + // 1. It should exist + // 2. Its "remaining reads" count should be 0 (all planned reads + // happened at this point). 
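// Illustrative sketch (editorial aside, not part of this patch): the reuse policy
// described above, restated over simplified stand-in types - take the first tensor
// already allocated for this output slot whose planned reads have all happened,
// otherwise grow the pool. `Tensor`, `Link`, `Slot` and the callbacks are
// hypothetical simplifications of the NPUW types around them.
#include <cstddef>
#include <functional>
#include <memory>
#include <utility>
#include <vector>

using Tensor = std::shared_ptr<void>;               // stand-in for ov::SoPtr<ov::ITensor>
using Link = std::pair<std::size_t, std::size_t>;   // {submodel index, output port}

struct Slot {
    Tensor ptr;
    Link producer;
};

Tensor acquire(std::vector<Slot>& pool,
               const Link& from,
               const std::function<std::size_t(const Link&)>& reads_left,
               const std::function<Tensor()>& allocate) {
    for (auto& slot : pool) {
        if (reads_left(slot.producer) == 0u) {  // all planned reads of the old producer happened
            slot.producer = from;               // hand the tensor over to the new producer
            return slot.ptr;                    // reuse - no new allocation
        }
    }
    pool.push_back(Slot{allocate(), from});     // no free slot - allocate a fresh tensor
    return pool.back().ptr;
}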
+ // The tensor storage is organized like this: + // - Function: Here we use .replaced_by as a function identifier; taken from `from` + // - Output index: taken from `from` + // - A vector of resident tensors + + LOG_VERB("Assigning tensor for Subgraph[" << from.first << "]/" << from.second << "..."); + LOG_BLOCK(); + + const auto& comp_model_desc = m_model->m_compiled_submodels[from.first]; + NPUW_ASSERT(comp_model_desc.replaced_by.has_value()); + + const auto real_idx = comp_model_desc.replaced_by.value(); + + FO func_output = {real_idx, from.second}; + auto& assigned_memory = m_memory[func_output]; + auto asgn_iter = std::find_if(assigned_memory.begin(), assigned_memory.end(), [&](Assignment& a) { + return m_sim.remaining_reads(a.from) == 0u; + }); + if (asgn_iter != assigned_memory.end()) { + // Reassign this memory slot to the new "from" + asgn_iter->from = from; + m_table[from] = asgn_iter->ptr; + } else { + // No free space at this point - allocate a new tensor + const auto& proto_comp_model_desc = m_model->m_compiled_submodels[real_idx]; + const auto& proto_comp_model = proto_comp_model_desc.compiled_model; + + const auto& oport = proto_comp_model->outputs()[from.second]; + ov::Shape oshape = oport.get_shape(); + + if (proto_comp_model_desc.spatial) { + oshape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range; + } + const auto& device = m_model->funcall_mem_device(real_idx); + TensorPtr new_tensor = m_alloc(oport.get_element_type(), oshape, device); + NPUW_ASSERT(new_tensor); + + assigned_memory.push_back(Assignment{new_tensor, from}); + m_table[from] = new_tensor; + } + LOG_VERB("Done"); +} + +ov::npuw::TensorPtr ov::npuw::FuncMemMgr::get_tensor(const LinkFrom& from) { + return m_table.at(from); +} + ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr& compiled_model) - : IBaseInferRequest(compiled_model) { + : IBaseInferRequest(compiled_model), + m_func_mem_mgr(compiled_model) { + using namespace std::placeholders; + m_func_mem_mgr.set_alloc(std::bind(&JustInferRequest::allocMem, this, _1, _2, _3)); + m_func_mem_mgr.assign_memory(); + m_use_function_pipelining = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>(); if (m_use_function_pipelining) { LOG_WARN("Function call pipelining is enabled for " << m_npuw_model->m_name @@ -67,27 +232,20 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrparams) { const auto& iport = proto_comp_model_desc.compiled_model->inputs()[p.idx]; m_spatial_io[real_idx].input_tails[p.idx] = - allocTensor(iport, m_npuw_model->funcall_mem_device(real_idx)); + allocOut(iport, m_npuw_model->funcall_mem_device(real_idx)); } const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size(); for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) { const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx]; m_spatial_io[real_idx].output_tails[out_idx] = - allocTensor(oport, m_npuw_model->funcall_mem_device(real_idx)); + allocOut(oport, m_npuw_model->funcall_mem_device(real_idx)); } } } // if(spatial) for (size_t out_idx = 0; out_idx < num_outputs; out_idx++) { - const auto& port = proto_comp_model->outputs()[out_idx]; - ov::Shape shape = port.get_shape(); - - // If the subgraph is spatial, promote the output size to the full vector size - if (proto_comp_model_desc.spatial) { - shape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range; - } - m_funcall_result[LinkFrom{i, out_idx}] = - allocTensor(port.get_element_type(), shape, 
m_npuw_model->funcall_mem_device(real_idx)); + const auto from = LinkFrom{i, out_idx}; + m_funcall_result[from] = m_func_mem_mgr.get_tensor(from); } if (real_idx != i) { // If this function call is NOT the function body, do nothing here - the original @@ -152,7 +310,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrinputs().size(); i++) { const auto& port = m_npuw_model->inputs()[i]; - ov::SoPtr allocated = allocTensor(port, m_npuw_model->global_mem_device()); + ov::SoPtr allocated = allocOut(port, m_npuw_model->global_mem_device()); m_input_tensors.push_back(allocated); m_input_allocated.insert(allocated->data()); m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true}; @@ -174,7 +332,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrsecond // Function calls have their tensors allocated, so just use one - : allocTensor(port, m_npuw_model->global_mem_device()); + : allocOut(port, m_npuw_model->global_mem_device()); m_output_tensors.push_back(tensor); m_port_to_tensor[port] = TensorStorage{tensor, true}; @@ -920,27 +1078,22 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool } // if (replaced_by) } -ov::SoPtr ov::npuw::JustInferRequest::allocTensor(const ov::element::Type type, - const ov::Shape& shape, - const std::string& device) { +ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocMem(const ov::element::Type type, + const ov::Shape& shape, + const std::string& device) { if (device == "CPU" || ov::shape_size(shape) == 0) { return ov::get_tensor_impl(ov::Tensor(type, shape)); } - ov::SoPtr remote_tensor; - ov::Tensor allocated_tensor; - { - std::lock_guard guard(m_alloc_mutex); - m_remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr; - remote_tensor = m_remote_ctx->create_host_tensor(type, shape); - allocated_tensor = ov::make_tensor(remote_tensor); - } - return ov::get_tensor_impl(allocated_tensor); + std::lock_guard guard(m_alloc_mutex); + auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr; + auto remote_tensor = remote_ctx->create_host_tensor(type, shape); + return ov::get_tensor_impl(ov::make_tensor(remote_tensor)); } -ov::SoPtr ov::npuw::JustInferRequest::allocTensor(const ov::Output& node, - const std::string& device) { - return allocTensor(node.get_element_type(), node.get_shape(), device); +ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocOut(const ov::Output& node, + const std::string& device) { + return allocMem(node.get_element_type(), node.get_shape(), device); } void ov::npuw::JustInferRequest::subscribe_subrequest(std::size_t idx, Completed cb) { diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp index 7335b54c30062e..88838d8b39d75f 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp @@ -22,6 +22,56 @@ namespace npuw { class CompiledModel; class AsyncInferRequest; +using LinkFrom = std::pair; // FIXME: This is a third, if not fourth, definitiion of such structure + +using TensorPtr = ov::SoPtr; + +class MemAccessSim { +public: + explicit MemAccessSim(const std::shared_ptr& compiled_model); + + using ReadList = std::list; + const ReadList& read_list(std::size_t idx) const; + + std::size_t remaining_reads(const LinkFrom& from); + void register_read(const LinkFrom& from); + +private: + std::map m_remaining_reads; + 
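// Illustrative sketch (editorial aside, not part of this patch): the simulator is
// reference counting over producer links - every planned consumer of a submodel
// output (another submodel or a global model output) bumps a counter up-front,
// and each simulated execution decrements it; once a counter reaches zero the
// corresponding tensor can be recycled. Types below are simplified stand-ins.
#include <cstddef>
#include <list>
#include <map>
#include <utility>
#include <vector>

using Link = std::pair<std::size_t, std::size_t>;  // {submodel index, output port}

struct ReadSim {
    explicit ReadSim(std::size_t num_submodels) : read_list(num_submodels) {}

    std::map<Link, std::size_t> remaining_reads;   // planned-but-not-yet-done reads per source
    std::vector<std::list<Link>> read_list;        // per-submodel: which sources it reads

    void plan(std::size_t reader, const Link& source) {
        remaining_reads[source]++;                 // one more planned read of `source`
        read_list[reader].push_back(source);
    }
    void simulate(std::size_t reader) {            // "execute" submodel `reader`
        for (const auto& source : read_list[reader])
            remaining_reads[source]--;             // at zero, `source` memory becomes reusable
    }
};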
std::vector m_read_list; +}; + +class FuncMemMgr { + MemAccessSim m_sim; + std::shared_ptr m_model; + + void assign(const LinkFrom& from); + + // Function ID -> Output port number + using FO = std::pair; + struct Assignment { + TensorPtr ptr; + LinkFrom from; + }; + std::map> m_memory; // Dynamic assignment table + std::map m_table; // Static allocation/assignment table + +public: + explicit FuncMemMgr(const std::shared_ptr& compiled_model); + + using AllocFcn = std::function; + void set_alloc(AllocFcn&& fcn); + void assign_memory(); + + TensorPtr get_tensor(const LinkFrom& from); + +private: + AllocFcn m_alloc; +}; + class JustInferRequest final : public IBaseInferRequest { public: explicit JustInferRequest(const std::shared_ptr& compiled_model); @@ -64,15 +114,11 @@ class JustInferRequest final : public IBaseInferRequest { void connect_subrequests(); void recreate_subrequests(std::size_t idx); - ov::SoPtr allocTensor(const ov::element::Type type, const ov::Shape& shape, const std::string& device); - ov::SoPtr allocTensor(const ov::Output& node, const std::string& device); + TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device); + TensorPtr allocOut(const ov::Output& node, const std::string& device); - using LinkFrom = std::pair; // FIXME: This is a third, if not fourth, definitiion of such structure - using TensorPtr = ov::SoPtr; - std::map m_funcall_result; + FuncMemMgr m_func_mem_mgr; // Owns memory + std::map m_funcall_result; // Provides a convenient link bool is_pipelined(std::size_t idx) const; bool m_use_function_pipelining = false; @@ -103,8 +149,6 @@ class JustInferRequest final : public IBaseInferRequest { std::vector m_subrequests_gio; std::mutex m_alloc_mutex; - std::shared_ptr m_remote_ctx = nullptr; - std::unordered_set m_input_allocated; }; diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp index 46b6cb7b12681d..a66159e6b4d1b7 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp @@ -40,7 +40,6 @@ std::vector getIsolates(const std::string& isolates_unparsed); std::vector getNoFolds(::intel_npu::Config& cfg); std::vector getNoFolds(const std::string& nofolds_unparsed); // Set default predefined values for COMPUTE pipeline -void setComputeConfig(PassContext& ctx); void dump_partitioning(const ov::npuw::Ensemble& ens, const std::string& to); size_t getMinGraphSize(::intel_npu::Config& cfg) { @@ -204,12 +203,6 @@ std::vector getNoFolds(const std::string& nofolds_unparsed) { return nofolds; } -void setComputeConfig(PassContext& ctx) { - // FIXME: initialize via a dedicated function instead of parsing - ctx.isolates = detail::getIsolates(ISOL_PRESETS.at("COMPUTE")); - ctx.nofolds = detail::getNoFolds("compute"); -} - void dump_partitioning(const ov::npuw::Ensemble& ens, const std::string& to) { pugi::xml_document doc; @@ -277,10 +270,21 @@ class Compiler { NONE, // Partitioning will consist of a single group with all the Ops INIT, // Initialize only. 
The hardest mode, every group has just 1 layer inside JUST, // "justParitioning" - combination of LHF + Remnants - REP, // Repeated blocks pipeline - combination of repeatedBlocks and Remnants - default configuration + REP, // Repeated blocks pipeline - combination of repeatedBlocks and Remnants + REG, // Regularized repeated blocks pipeline -same as REP, but with some strong hints first COMPUTE // Separates non-foldable compute subgraphs from the model based on predefined rules + REP }; + template + void warn_unused() { + const auto& val = m_cfg.get(); + if (val != C::defaultValue()) { + LOG_WARN("User-specified configuration {" << C::key() << " : " << val + << "} is ignored in the current pipeline " + << m_cfg.get<::intel_npu::NPUW_ONLINE_PIPELINE>()); + } + } + Pipeline currentPipeline() { std::string pipeline_opt = m_cfg.getString<::intel_npu::NPUW_ONLINE_PIPELINE>(); if (pipeline_opt == "NONE") { @@ -291,6 +295,8 @@ class Compiler { return Pipeline::JUST; } else if (pipeline_opt == "REP") { return Pipeline::REP; + } else if (pipeline_opt == "REG") { + return Pipeline::REG; } else if (pipeline_opt == "COMPUTE") { return Pipeline::COMPUTE; } else { @@ -346,6 +352,23 @@ class Compiler { LOG_INFO("Done"); } + void reg() { + LOG_INFO("Online partitioning: compiling regularized repeated blocks pipeline..."); + LOG_BLOCK(); + + m_snapshot->earlyAvoids(); + m_snapshot->earlyRegroup(); + m_snapshot->repeatedBlocks([&]() { + // This callback is called when repeatingBlocks algorithm thinks it is done + m_snapshot->stripTag("compute"); + }); + m_snapshot->repeat([&] { + m_snapshot->fuseRemnantsExtended(); + }); + + LOG_INFO("Done"); + } + public: Compiler(const std::shared_ptr& model, ::intel_npu::Config& cfg) : m_model(model), @@ -384,9 +407,24 @@ class Compiler { case Pipeline::REP: rep(); break; + case Pipeline::REG: + warn_unused<::intel_npu::NPUW_ONLINE_ISOLATE>(); + + // Only get isolates here. 
+ // NB: We ignore NO_FOLD everywhere except pipeline COMPUTE - this needs + // to be aligned in the future + ctx.isolates = detail::getIsolates(detail::ISOL_PRESETS.at("COMPUTE")); + m_snapshot->setCtx(ctx); + reg(); + break; case Pipeline::COMPUTE: + warn_unused<::intel_npu::NPUW_ONLINE_ISOLATE>(); + warn_unused<::intel_npu::NPUW_ONLINE_NO_FOLD>(); + // Manually set predefined isolates and nofolds then do rep() pipeline - detail::setComputeConfig(ctx); + // FIXME: initialize via a dedicated function instead of parsing + ctx.isolates = detail::getIsolates(detail::ISOL_PRESETS.at("COMPUTE")); + ctx.nofolds = detail::getNoFolds("compute"); m_snapshot->setCtx(ctx); rep(); break; diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp index 991330663bbe48..cfa9e451ffb149 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp @@ -443,6 +443,10 @@ void Group::isolate(const std::string& tag) { m_isol_tag = tag; } +void Group::dontIsolate() { + m_isol_tag = ""; +} + const std::string& Group::isolatedTag() const { return m_isol_tag; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp index 69688248a0b9ac..538eeb03bc851c 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp @@ -77,6 +77,7 @@ class Group : public std::enable_shared_from_this { // FIXME: unify avoid and isolate void avoid(const std::string& device); void isolate(const std::string& tag); + void dontIsolate(); const std::set& avoidedTargets() const; const std::string& isolatedTag() const; std::string specialTags() const; diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp index 82856cece3de40..4cdc92ffc92d25 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp @@ -436,18 +436,27 @@ void Snapshot::earlyRegroup() { LOG_INFO("DONE."); } -void Snapshot::repeatedBlocks() { +void Snapshot::repeatedBlocks(Snapshot::CB&& on_done) { LOG_INFO("Online partitioning: executing repeatedBlocks pass group..."); LOG_BLOCK(); identifyUniques(); repeat([&] { repeat([&] { - mergeUniques(); + repeat([&] { + mergeUniques(); + }); + mergeTriangles(); + markInternalCompute(); + resetExcludedRep(); }); - mergeTriangles(); - markInternalCompute(); - resetExcludedRep(); + // While the current process is entirely done, let the caller + // influence the partitioning - so the algorithm could continue. + if (on_done) { + on_done(); + } else { + return; // FROM top-level repeat! 
+ } }); cleanUpUniques(); @@ -1086,3 +1095,12 @@ void Snapshot::repeat(detail::Pass&& pass) { void Snapshot::setCtx(const ov::npuw::online::PassContext& ctx) { m_ctx = ctx; } + +void Snapshot::stripTag(const std::string& tag) { + for (auto&& nh : m_graph->nodes()) { + auto gptr = m_graph->meta(nh).get(); + if (gptr->isolatedTag() == tag) { + gptr->dontIsolate(); + } + } +} diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp index e7e5121b1240e7..6da1a6d98939bb 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp @@ -46,10 +46,13 @@ class Snapshot : public std::enable_shared_from_this { void fuseInputs(); // Advanced passes for repeated blocks algorithm - void repeatedBlocks(); + using CB = std::function; + void repeatedBlocks(CB&& on_done = {}); void earlyAvoids(); void earlyRegroup(); + void stripTag(const std::string& tag); + // Utility std::shared_ptr getGraph() const; const detail::OVPortsMap& getPortsMap() const; diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp index c9a162421fe243..077fb6d6660132 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp @@ -335,7 +335,7 @@ DQMatMulGQ2i::DQMatMulGQ2i(Context::Ref ctx) { auto qcvtw = opp::wrap_type({qweight}); auto qmuls = opp::wrap_type({qcvtw, qcoeff}); auto qreshp = opp::wrap_type({qmuls, opp::any_input()}); - auto qcvtr = opp::wrap_type({qreshp}); + auto qcvtr = opp::optional({qreshp->output(0)}); auto qmmi = opp::any_input(); auto qmm = opp::wrap_type({qmmi, qcvtr}); @@ -409,13 +409,18 @@ DQMatMulGQ2i::DQMatMulGQ2i(Context::Ref ctx) { auto rshp_ccat = std::make_shared(scaled, rshp_ccat_c, false); auto reduce_axis = std::make_shared(ov::element::i32, ov::Shape{}, 1); - auto reduce = std::make_shared(rshp_ccat, reduce_axis, true); + // Make reduceSum not to keep axis because then it will convert to poolings in compiler. + // Otherwise reduceSum will convert to the convolution which is less efficient than poolings. + auto reduce = std::make_shared(rshp_ccat, reduce_axis, false); auto rshp_out_c = std::make_shared(ov::element::i32, ov::Shape{3}, out_shape); auto rshp_out = std::make_shared(reduce, rshp_out_c, false); - // Convert the result to f32 to maintain the graph contracts. FIXME should be avoided - auto out = std::make_shared(rshp_out, ov::element::f32); + // Convert the result to f32 to maintain the graph contracts if required. + std::shared_ptr out = rshp_out; + if (matched_matmul->get_element_type() == ov::element::f32) { + out = std::make_shared(rshp_out, ov::element::f32); + } // Now.. 
Reconnect the matmul readers to the new output (reducesum) for (auto&& r : matched_matmul->output(0).get_target_inputs()) { @@ -690,11 +695,6 @@ DQParMMGQ::DQParMMGQ(Context::Ref ctx) { return false; } - if (qmmi_shape[1] != 1 && !ctx.get().is_spatial) { - // For non 1-token cases, do transformation if and only if and only if the block is spatial - return false; - } - if (!matmul->get_transpose_a() && !matmul->get_transpose_b()) { ctx.get().register_parallel_matmul(node_to_output.at(qmmi), 2, Context::DQParMM{w_param, s_param, matmul}); } else if (!matmul->get_transpose_a() && matmul->get_transpose_b()) { @@ -752,7 +752,7 @@ void mergeParallelMatMuls(const std::shared_ptr& m, Context& ctx) { auto new_cvt = std::make_shared(new_w, new_s->get_element_type()); std::shared_ptr new_mul = std::make_shared(new_cvt, new_s); - if (new_s->get_element_type() == ov::element::f16) { + if ((new_s->get_element_type() == ov::element::f16) && (orig_multiply.get_element_type() == ov::element::f32)) { new_mul = std::make_shared(new_mul, ov::element::f32); } auto new_w_shape = new_w->get_shape(); diff --git a/src/plugins/intel_npu/tests/functional/behavior/npuw/behavior_tests.cpp b/src/plugins/intel_npu/tests/functional/behavior/npuw/behavior_tests.cpp index 093e3235afb78f..b55d39bead49bb 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/npuw/behavior_tests.cpp +++ b/src/plugins/intel_npu/tests/functional/behavior/npuw/behavior_tests.cpp @@ -489,30 +489,24 @@ TEST_F(BehaviorTestsNPUWOnlinePartitioning, FoldingAndPipelining) { EXPECT_COMPILE_MODEL(mock_cpu, TIMES(0)); } - for (int i = 0; i < 3; i++) { - // Here we will create 2 infer requests per model, - // so `create_sync_infer_request()` should be called twice - // per model: - EXPECT_CREATE_SYNC_INFER_REQ(mock_npu, MODEL(i), TIMES(2)); - } - - // 1st model 1st infer request is called once -- head - EXPECT_INFER_FOR(mock_npu, MODEL(0), INFER_REQ(0), TIMES(1)); - // 1st model 2nd infer request is never called, - // it is not a function and is not repeated - EXPECT_INFER_FOR(mock_npu, MODEL(0), INFER_REQ(1), TIMES(0)); + // 1 infer request for head: + EXPECT_CREATE_SYNC_INFER_REQ(mock_npu, MODEL(0), TIMES(1)); + // 2 infer requests for function, `create_sync_infer_request()` + // should be called twice here: + EXPECT_CREATE_SYNC_INFER_REQ(mock_npu, MODEL(1), TIMES(2)); + // 1 infer request for tail: + EXPECT_CREATE_SYNC_INFER_REQ(mock_npu, MODEL(2), TIMES(1)); + + // Head's infer request is called once: + EXPECT_INFER(mock_npu, MODEL(0), TIMES(1)); - // Repeated block - // 2nd model 1st infer request is called 5 times + // Repeated block's model 1st infer request is called 5 times: EXPECT_INFER_FOR(mock_npu, MODEL(1), INFER_REQ(0), TIMES(5)); - // 2nd model 2nd infer request (brother of 1st one) is called 5 times + // Repeated block's model 2nd infer request (brother of 1st one) is called 5 times: EXPECT_INFER_FOR(mock_npu, MODEL(1), INFER_REQ(1), TIMES(5)); - // 3rd model 1st infer request is called once -- tail - EXPECT_INFER_FOR(mock_npu, MODEL(2), INFER_REQ(0), TIMES(1)); - // 3rd model 2nd infer request is never called, - // it is not a function and is not repeated - EXPECT_INFER_FOR(mock_npu, MODEL(2), INFER_REQ(1), TIMES(0)); + // Tail's infer request is called once: + EXPECT_INFER(mock_npu, MODEL(2), TIMES(1)); // Register mock objects as plugins in OpenVINO: register_mock_plugins_in_ov(); diff --git a/src/plugins/intel_npu/tests/functional/behavior/npuw/mocks/mock_plugins.cpp 
b/src/plugins/intel_npu/tests/functional/behavior/npuw/mocks/mock_plugins.cpp index ed4bf72a945f79..950d80b279324f 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/npuw/mocks/mock_plugins.cpp +++ b/src/plugins/intel_npu/tests/functional/behavior/npuw/mocks/mock_plugins.cpp @@ -243,9 +243,10 @@ void MockPluginBase::create_implementation() { .WillByDefault([](const ov::AnyMap& remote_properties) -> ov::SoPtr { OPENVINO_NOT_IMPLEMENTED; }); + // This method is utilized for remote tensor allocation in NPUW JustInferRequest and Weight bank. ON_CALL(*this, get_default_context) .WillByDefault([](const ov::AnyMap& remote_properties) -> ov::SoPtr { - OPENVINO_NOT_IMPLEMENTED; + return std::make_shared(device_name); }); ON_CALL(*this, import_model(testing::_, testing::_)) .WillByDefault([](std::istream& model, const ov::AnyMap& properties) diff --git a/src/plugins/intel_npu/tests/functional/behavior/npuw/mocks/mock_plugins.hpp b/src/plugins/intel_npu/tests/functional/behavior/npuw/mocks/mock_plugins.hpp index 4d720796c6abbf..e8f9e134fcb324 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/npuw/mocks/mock_plugins.hpp +++ b/src/plugins/intel_npu/tests/functional/behavior/npuw/mocks/mock_plugins.hpp @@ -23,6 +23,23 @@ namespace ov { namespace npuw { namespace tests { +// Need for remote tensor allocation in NPUW JustInferRequest and Weight bank. +// They utilize "create_host_tensor()" method. +// TODO: Mock "create_host_tensor()" method and add tests for it. +class MockRemoteContext : public ov::IRemoteContext { + std::string m_name; + +public: + MockRemoteContext(std::string name) : m_name(std::move(name)) {} + const std::string& get_device_name() const override { + return m_name; + } + MOCK_METHOD(ov::SoPtr, + create_tensor, + (const ov::element::Type&, const ov::Shape&, const ov::AnyMap&)); + MOCK_METHOD(const ov::AnyMap&, get_property, (), (const)); +}; + class MockCompiledModelBase; using MockCompiledModel = testing::NiceMock; diff --git a/src/plugins/template/tests/functional/op_reference/proposal.cpp b/src/plugins/template/tests/functional/op_reference/proposal.cpp index aa49a6b7330166..435a279588af07 100644 --- a/src/plugins/template/tests/functional/op_reference/proposal.cpp +++ b/src/plugins/template/tests/functional/op_reference/proposal.cpp @@ -88,10 +88,6 @@ struct ProposalV4Params { const int feature_stride, const int pre_nms_topn, const int post_nms_topn, - const size_t image_shape_num, - const size_t image_h, - const size_t image_w, - const size_t image_z, const std::vector& ratios, const std::vector& scales, const size_t batch_size, @@ -101,19 +97,22 @@ struct ProposalV4Params { const ov::element::Type& iType, const std::vector& clsScoreValues, const std::vector& bboxPredValues, + const std::vector& inputInfoValues, const std::vector& proposalValues, const std::vector& probsValues, + const std::string& framework, const std::string& test_name = "") : inType(iType), outType(iType), clsScoreData(CreateTensor(iType, clsScoreValues)), bboxPredData(CreateTensor(iType, bboxPredValues)), + imageInfoData(CreateTensor(iType, inputInfoValues)), refProposalData(CreateTensor(Shape{batch_size * post_nms_topn, 5}, iType, proposalValues)), refProbsData(CreateTensor(Shape{batch_size * post_nms_topn}, iType, probsValues)), testcaseName(test_name) { clsScoreShape = Shape{batch_size, anchor_num * 2, feat_map_height, feat_map_width}; bboxPredShape = Shape{batch_size, anchor_num * 4, feat_map_height, feat_map_width}; - imageShapeShape = Shape{image_shape_num}; + imageInfoShape = 
Shape{inputInfoValues.size()}; attrs.base_size = min_bbox_size; attrs.min_size = min_bbox_size; @@ -129,25 +128,19 @@ struct ProposalV4Params { attrs.normalize = false; attrs.box_size_scale = 1.0f; attrs.box_coordinate_scale = 1.0f; - attrs.framework = ""; + attrs.framework = framework; attrs.infer_probs = true; - - std::vector inputShapeValues; - inputShapeValues.push_back(static_cast(image_h)); - inputShapeValues.push_back(static_cast(image_w)); - inputShapeValues.push_back(static_cast(image_z)); - imageShapeData = CreateTensor(iType, inputShapeValues); } ov::op::v4::Proposal::Attributes attrs; ov::PartialShape clsScoreShape; ov::PartialShape bboxPredShape; - ov::PartialShape imageShapeShape; + ov::PartialShape imageInfoShape; ov::element::Type inType; ov::element::Type outType; ov::Tensor clsScoreData; ov::Tensor bboxPredData; - ov::Tensor imageShapeData; + ov::Tensor imageInfoData; ov::Tensor refProposalData; ov::Tensor refProbsData; std::string testcaseName; @@ -192,7 +185,7 @@ class ReferenceProposalV4LayerTest : public testing::TestWithParam& obj) { @@ -200,9 +193,11 @@ class ReferenceProposalV4LayerTest : public testing::TestWithParam CreateFunction(const ProposalV4Params& params) { const auto class_probs_param = std::make_shared(params.inType, params.clsScoreShape); const auto bbox_deltas_param = std::make_shared(params.inType, params.bboxPredShape); - const auto image_shape_param = std::make_shared(params.inType, params.imageShapeShape); + const auto image_shape_param = std::make_shared(params.inType, params.imageInfoShape); const auto Proposal = std::make_shared(class_probs_param, bbox_deltas_param, image_shape_param, params.attrs); return std::make_shared(Proposal->outputs(), @@ -235,21 +230,21 @@ std::vector generateProposalV1Params() { std::vector proposalV1Params{ ProposalV1Params( - 0.7f, - 16, - 16, - 6000, - 10, // iou_threshold, min_nnox_size, feature_stride,pre_nms_topn, post_nms_topn - 3, - 210, - 350, - 1, // image_shape_num, image_h, image_w, image_z + 0.7f, // iou_threshold + 16, // min_nnox_size + 16, // feature_stride + 6000, // pre_nms_topn + 10, // post_nms_topn + 3, // image_shape_num + 210, // image_h + 350, // image_w + 1, // image_z {0.5f}, // ratios {32.0f}, // scales - 1, - 1, - 10, - 10, // batch_size, anchor_num, feat_map_height, feat_map_width + 1, // batch_size + 1, // anchor_num + 10, // feat_map_height + 10, // feat_map_width IN_ET, std::vector{ 0.000240f, 0.003802f, 0.111432f, 0.000503f, 0.007887f, 0.144701f, 0.399074f, 0.004680f, // 0 @@ -351,22 +346,18 @@ std::vector generateProposalV4Params() { using T = typename element_type_traits::value_type; std::vector proposalV4Params{ - ProposalV4Params( - 0.7f, - 16, - 16, - 6000, - 10, // iou_threshold, min_nnox_size, feature_stride,pre_nms_topn, post_nms_topn - 3, - 210, - 350, - 1, // image_shape_num, image_h, image_w, image_z + ProposalV4Params{ + 0.7f, // iou_threshold + 16, // min_bbox_size + 16, // feature_stride + 6000, // pre_nms_topn + 10, // post_nms_topn {0.5f}, // ratios {32.0f}, // scales - 1, - 1, - 10, - 10, // batch_size, anchor_num, feat_map_height, feat_map_width + 1, // batch_size + 1, // anchor_num + 10, // feat_map_height + 10, // feat_map_width IN_ET, std::vector{ 0.000240f, 0.003802f, 0.111432f, 0.000503f, 0.007887f, 0.144701f, 0.399074f, 0.004680f, // 0 @@ -447,6 +438,7 @@ std::vector generateProposalV4Params() { 0.026623f, 0.117951f, -0.076234f, -0.811997f, 0.01301f, 0.020042f, 0.173756f, -0.036191f, -0.068887f, 0.0229f, 0.245465f, 0.214282f, -0.011054f, 0.132813f, 0.241014f, 
-0.148763f, }, + std::vector{210, 350, 1}, std::vector{ 0.000000f, 0.000000f, 0.000000f, 349.000000f, 209.000000f, // 0 0.000000f, 0.000000f, 0.000000f, 237.625443f, 209.000000f, // 5 @@ -470,36 +462,135 @@ std::vector generateProposalV4Params() { 0.0008570f, 0.0002190f, 0.0000000f, - }), + }, + ""}, + ProposalV4Params{ + 0.7f, // iou_threshold + 16, // min_bbox_size + 16, // feature_stride + 6000, // pre_nms_topn + 10, // post_nms_topn + {0.5f}, // ratios + {32.0f}, // scales + 1, // batch_size + 1, // anchor_num + 10, // feat_map_height + 10, // feat_map_width + IN_ET, + std::vector{ + 0.000240f, 0.003802f, 0.111432f, 0.000503f, 0.007887f, 0.144701f, 0.399074f, 0.004680f, // 0 + 0.139741f, 0.002386f, 0.030003f, 0.276552f, 0.000267f, 0.022971f, 0.287953f, 0.050235f, // 8 + 0.002580f, 0.206311f, 0.000146f, 0.009656f, 0.175462f, 0.000147f, 0.014718f, 0.272348f, // 16 + 0.065199f, 0.003286f, 0.185335f, 0.003720f, 0.025932f, 0.251401f, 0.001465f, 0.090447f, // 24 + 0.488469f, 0.092259f, 0.019306f, 0.379091f, 0.005311f, 0.010369f, 0.087615f, 0.042003f, // 32 + 0.073871f, 0.416763f, 0.044282f, 0.069776f, 0.313032f, 0.000457f, 0.017346f, 0.089762f, // 40 + 0.000820f, 0.103986f, 0.367993f, 0.026315f, 0.035701f, 0.299252f, 0.000135f, 0.017825f, // 48 + 0.150119f, 0.000076f, 0.050511f, 0.269601f, 0.026680f, 0.003541f, 0.189765f, 0.000051f, // 56 + 0.004315f, 0.193150f, 0.000032f, 0.007254f, 0.185557f, 0.051526f, 0.000657f, 0.117579f, // 64 + 0.000115f, 0.010179f, 0.293187f, 0.000025f, 0.006505f, 0.175345f, 0.032587f, 0.000469f, // 72 + 0.098443f, 0.000121f, 0.009600f, 0.322782f, 0.000032f, 0.004543f, 0.166860f, 0.044911f, // 80 + 0.000187f, 0.102691f, 0.000242f, 0.005502f, 0.107865f, 0.000191f, 0.005336f, 0.086893f, // 88 + 0.078422f, 0.000345f, 0.079096f, 0.000281f, 0.016388f, 0.214072f, 0.000107f, 0.012027f, // 96 + 0.192754f, 0.049531f, 0.000386f, 0.149893f, 0.000374f, 0.016965f, 0.204781f, 0.000163f, // 104 + 0.016272f, 0.215277f, 0.032298f, 0.000857f, 0.133426f, 0.000614f, 0.020215f, 0.165789f, // 112 + 0.000225f, 0.036951f, 0.262195f, 0.087675f, 0.004596f, 0.147764f, 0.000219f, 0.010502f, // 120 + 0.163394f, 0.000152f, 0.023116f, 0.241702f, 0.081800f, 0.002197f, 0.146637f, 0.000193f, // 128 + 0.012017f, 0.133497f, 0.000375f, 0.028605f, 0.309179f, 0.065962f, 0.005508f, 0.155530f, // 136 + 0.000186f, 0.004540f, 0.079319f, 0.000799f, 0.031003f, 0.303045f, 0.051473f, 0.017770f, // 144 + 0.206188f, 0.000202f, 0.004291f, 0.061095f, 0.001109f, 0.018094f, 0.156639f, 0.026062f, // 152 + 0.005270f, 0.148651f, 0.000026f, 0.007300f, 0.096013f, 0.000383f, 0.022134f, 0.129511f, // 160 + 0.080882f, 0.003416f, 0.129922f, 0.000037f, 0.010040f, 0.130007f, 0.000116f, 0.014904f, // 168 + 0.171423f, 0.082893f, 0.000921f, 0.154976f, 0.000142f, 0.016552f, 0.209696f, 0.000227f, // 176 + 0.022418f, 0.228501f, 0.111712f, 0.001987f, 0.158164f, 0.001200f, 0.027049f, 0.308222f, // 184 + 0.001366f, 0.038146f, 0.287945f, 0.072526f, 0.016064f, 0.257895f, 0.000595f, 0.016962f, // 192 + }, + std::vector{ + 0.006756f, -0.055635f, 0.030843f, 0.007482f, 0.009056f, -0.041824f, 0.119722f, 0.168988f, + 0.002822f, 0.039733f, 0.109005f, 0.245152f, -0.013196f, -0.018222f, -0.170122f, -0.374904f, + -0.005455f, -0.034059f, -0.006787f, 0.072005f, -0.017933f, -0.007358f, 0.034149f, 0.123846f, + 0.128319f, 0.016107f, -0.615487f, -1.235094f, -0.024253f, -0.019406f, 0.134142f, 0.157853f, + -0.021119f, 0.007383f, 0.089365f, 0.092854f, 0.062491f, 0.002366f, 0.122464f, -0.003326f, + 0.015468f, -0.034088f, 0.079009f, 0.075483f, 0.011972f, 
0.042427f, 0.106865f, 0.158754f, + 0.071211f, -0.034009f, 0.007985f, -0.441477f, 0.009046f, -0.028515f, 0.095372f, 0.119598f, + -0.007553f, -0.0072f, 0.105072f, 0.084314f, 0.23268f, -0.02906f, -0.408454f, -1.13439f, + 0.016202f, -0.037859f, 0.130873f, 0.129652f, 0.002064f, -0.011969f, 0.171623f, 0.050218f, + 0.113831f, 0.028922f, 0.017785f, 0.059708f, 0.037658f, -0.011245f, 0.097197f, 0.137491f, + 0.024218f, 0.04739f, 0.091978f, 0.217333f, 0.088418f, -0.004662f, -0.095168f, -0.397928f, + 0.02639f, -0.008501f, 0.068487f, 0.108465f, 0.020069f, 0.018829f, 0.040206f, 0.068473f, + 0.226458f, -0.072871f, -0.672384f, -1.447558f, 0.039598f, 0.017471f, 0.187288f, 0.08409f, + 0.017152f, -0.00516f, 0.183419f, 0.068469f, 0.063944f, 0.160725f, -0.022493f, -0.132291f, + 0.010542f, 0.036318f, 0.074042f, -0.013323f, 0.00808f, 0.060365f, 0.120566f, 0.21866f, + 0.046324f, 0.088741f, 0.029469f, -0.517183f, 0.00917f, 0.011915f, 0.053674f, 0.140168f, + 0.0033f, 0.022759f, -0.006196f, 0.063839f, 0.083726f, -0.088385f, -0.57208f, -1.454211f, + 0.020655f, 0.010788f, 0.134951f, 0.109709f, 0.015445f, -0.015363f, 0.109153f, 0.051209f, + 0.024297f, 0.139126f, -0.12358f, -0.127979f, 0.004587f, 0.004751f, 0.047292f, 0.027066f, + 0.011003f, 0.069887f, 0.117052f, 0.267419f, 0.039306f, 0.077584f, 0.02579f, -0.496149f, + -0.005569f, 0.015494f, -0.011662f, 0.105549f, -0.007015f, 0.031984f, -0.075742f, 0.0852f, + 0.023886f, -0.053107f, -0.325533f, -1.329066f, 0.004688f, 0.034501f, 0.089317f, 0.042463f, + 0.004212f, -0.015128f, 0.00892f, 0.028266f, 0.009997f, 0.157822f, 0.020116f, -0.142337f, + 0.008199f, 0.046564f, 0.083014f, 0.046307f, 0.006771f, 0.084997f, 0.141935f, 0.228339f, + -0.020308f, 0.077745f, -0.018319f, -0.522311f, 0.010432f, 0.024641f, 0.020571f, 0.097148f, + 0.002064f, 0.035053f, -0.121995f, 0.012222f, -0.030779f, 0.100481f, -0.331737f, -1.257669f, + -0.013079f, 0.021227f, 0.159949f, 0.120097f, 0.005765f, -0.012335f, -0.005268f, 0.042067f, + -0.043972f, 0.102556f, 0.180494f, -0.084721f, -0.011962f, 0.031302f, 0.112511f, 0.027557f, + -0.002085f, 0.082978f, 0.149409f, 0.195091f, -0.033731f, 0.019861f, -0.064047f, -0.471328f, + -0.004093f, 0.016803f, 0.044635f, 0.058912f, -0.018735f, 0.035536f, -0.050373f, -0.002794f, + -0.086705f, 0.038435f, -0.301466f, -1.071246f, -0.028247f, 0.018984f, 0.254702f, 0.141142f, + -0.017522f, 0.014843f, 0.079391f, 0.079662f, -0.051204f, 0.048419f, 0.235604f, -0.185797f, + -0.019569f, 0.02678f, 0.162507f, 0.046435f, -0.004606f, 0.08806f, 0.18634f, 0.193957f, + -0.024333f, -0.01298f, -0.17977f, -0.65881f, -0.003778f, 0.007418f, 0.065439f, 0.104549f, + -0.027706f, 0.03301f, 0.057492f, 0.032019f, -0.135337f, 0.000269f, -0.250203f, -1.181688f, + -0.027022f, -0.006755f, 0.206848f, 0.129268f, -0.003529f, 0.013445f, 0.181484f, 0.139955f, + -0.036587f, 0.065824f, 0.288751f, -0.110813f, -0.015578f, 0.044818f, 0.17756f, 0.006914f, + 0.002329f, 0.068982f, 0.189079f, 0.184253f, 0.00301f, -0.039168f, -0.010855f, -0.393254f, + 0.000028f, 0.001906f, 0.07217f, 0.063305f, -0.026144f, 0.028842f, 0.139149f, 0.023377f, + 0.023362f, 0.023559f, -0.145386f, -0.863572f, -0.015749f, -0.021364f, 0.172571f, 0.078393f, + -0.037253f, 0.014978f, 0.221502f, 0.189111f, -0.048956f, 0.085409f, 0.325399f, -0.058294f, + -0.028495f, 0.021663f, 0.19392f, 0.02706f, 0.006908f, 0.065751f, 0.176395f, 0.138375f, + 0.012418f, -0.031228f, -0.008762f, -0.427345f, -0.013677f, -0.002429f, 0.069655f, 0.019505f, + -0.036763f, 0.022528f, 0.201062f, 0.022205f, 0.024528f, 0.06241f, -0.076237f, -0.840695f, + -0.007268f, -0.027865f, 
0.211056f, 0.074744f, -0.053563f, 0.006863f, 0.301432f, 0.192879f, + -0.021944f, 0.100535f, 0.19031f, -0.133746f, -0.006151f, 0.023944f, 0.13561f, -0.03259f, + 0.000618f, 0.063736f, 0.180904f, 0.12393f, 0.001275f, -0.0306f, -0.032822f, -0.496515f, + 0.009757f, 0.014602f, 0.004532f, -0.039969f, -0.015984f, 0.047726f, 0.099865f, 0.003163f, + 0.026623f, 0.117951f, -0.076234f, -0.811997f, 0.01301f, 0.020042f, 0.173756f, -0.036191f, + -0.068887f, 0.0229f, 0.245465f, 0.214282f, -0.011054f, 0.132813f, 0.241014f, -0.148763f, + }, + std::vector{210, 350, 1, 1}, + std::vector{0.f, 11.9688f, 4.02532f, 204.528f, 182.586f, 0.f, 33.7915f, 48.4886f, 210.f, + 238.505f, 0.f, 0.f, 0.f, 204.428f, 337.029f, 0.f, 72.611f, 9.87545f, + 203.687f, 212.299f, 0.f, 5.08432f, 4.19913f, 208.719f, 249.225f, 0.f, 23.6503f, + 57.8165f, 210.f, 350.f, 0.f, 84.8804f, 9.47241f, 156.822f, 243.003f, 0.f, + 101.663f, 15.5542f, 166.083f, 327.839f, 0.f, 13.9738f, 0.f, 210.f, 128.482f, + 0.f, 77.8929f, 29.663f, 186.561f, 313.287f + + }, + std::vector< + T>{0.309179, 0.308222, 0.303045, 0.241702, 0.192754, 0.165789, 0.15553, 0.154976, 0.146637, 0.129511}, + "tensorflow"}, }; return proposalV4Params; } std::vector generateProposalV1CombinedParams() { - const std::vector> proposalTypeParams{ - generateProposalV1Params(), - generateProposalV1Params(), - generateProposalV1Params(), - generateProposalV1Params()}; + std::vector> proposalTypeParams{generateProposalV1Params(), + generateProposalV1Params(), + generateProposalV1Params(), + generateProposalV1Params()}; std::vector combinedParams; - - for (const auto& params : proposalTypeParams) { - combinedParams.insert(combinedParams.end(), params.begin(), params.end()); - } + for (auto& params : proposalTypeParams) + std::move(params.begin(), params.end(), std::back_inserter(combinedParams)); return combinedParams; } std::vector generateProposalV4CombinedParams() { - const std::vector> proposalTypeParams{ - generateProposalV4Params(), - generateProposalV4Params(), - generateProposalV4Params(), - generateProposalV4Params()}; + std::vector> proposalTypeParams{generateProposalV4Params(), + generateProposalV4Params(), + generateProposalV4Params(), + generateProposalV4Params()}; std::vector combinedParams; - - for (const auto& params : proposalTypeParams) { - combinedParams.insert(combinedParams.end(), params.begin(), params.end()); - } + for (auto& params : proposalTypeParams) + std::move(params.begin(), params.end(), std::back_inserter(combinedParams)); return combinedParams; } diff --git a/src/tests/functional/plugin/shared/include/subgraph_tests/lora_pattern.hpp b/src/tests/functional/plugin/shared/include/subgraph_tests/lora_pattern.hpp new file mode 100644 index 00000000000000..8f9687b7b93b2a --- /dev/null +++ b/src/tests/functional/plugin/shared/include/subgraph_tests/lora_pattern.hpp @@ -0,0 +1,23 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/subgraph/lora_pattern.hpp" + +namespace ov { +namespace test { + +TEST_P(LoraPatternMatmul, empty_tensors) { + targetStaticShapes = {{{{1, 20, K}}, {{N, K}}}}; + run_test_empty_tensors(); +} + +TEST_P(LoraPatternConvolution, empty_tensors) { + targetStaticShapes = {{{1, num_channels, 64, 64}}}; + run_test_empty_tensors(); +} + +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/subgraph_tests/rotary_pos_emb.hpp b/src/tests/functional/plugin/shared/include/subgraph_tests/rotary_pos_emb.hpp 
index f2b19a6748a6a7..7100ddca1083e3 100644 --- a/src/tests/functional/plugin/shared/include/subgraph_tests/rotary_pos_emb.hpp +++ b/src/tests/functional/plugin/shared/include/subgraph_tests/rotary_pos_emb.hpp @@ -87,5 +87,12 @@ TEST_P(RoPETestGPTJSlice, CompareWithRefs) { CheckNumberOfNodesWithType(function, {"RoPE"}, 1); }; +TEST_P(RoPETestChatGLM2DRoPEStridedSlice, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED(); + run(); + auto function = compiledModel.get_runtime_model(); + CheckNumberOfNodesWithType(function, {"RoPE"}, 1); +}; + } // namespace test } // namespace ov diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/lora_pattern.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/lora_pattern.hpp new file mode 100644 index 00000000000000..16764d37dcf688 --- /dev/null +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/lora_pattern.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/ov_subgraph.hpp" + +namespace ov { +namespace test { + +class LoraPatternBase : public SubgraphBaseTest { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj); + +protected: + void run_test_empty_tensors(); + +protected: + static constexpr auto t4_name = "lora/MatMul.B"; + static constexpr auto t5_name = "lora/MatMul.alpha"; + static constexpr auto t6_name = "lora/MatMul.A"; + static constexpr auto netType = ov::element::f32; +}; + +class LoraPatternMatmul : public LoraPatternBase, public testing::WithParamInterface { +public: + void SetUp() override; + +protected: + static constexpr size_t K = 563ul; // Weights matrix K dimension + static constexpr size_t N = 2048ul; // Weights matrix N dimension +}; + +class LoraPatternConvolution : public LoraPatternBase, public testing::WithParamInterface { +public: + void SetUp() override; + +protected: + static constexpr size_t num_channels = 320ul; +}; + +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/rotary_pos_emb.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/rotary_pos_emb.hpp index 2663a6f5ad3fab..e1182bd3b16e13 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/rotary_pos_emb.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/rotary_pos_emb.hpp @@ -115,5 +115,17 @@ class RoPETestGPTJSlice : public RoPETestGPTJStridedSlice { void SetUp() override; }; +class RoPETestChatGLM2DRoPEStridedSlice : public SubgraphBaseTest, public testing::WithParamInterface { +private: + std::shared_ptr buildROPE_ChatGLM(int batch, int head_cnt, int rotary_dims); +protected: + ov::Tensor create_i32_tensor(const ov::Shape& shape, int start, int step = 1); + void generate_inputs(const std::vector& targetInputStaticShapes) override; + void SetUp() override; + +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj); +}; + } // namespace test } // namespace ov diff --git a/src/tests/functional/shared_test_classes/src/subgraph/lora_pattern.cpp b/src/tests/functional/shared_test_classes/src/subgraph/lora_pattern.cpp new file mode 100644 index 00000000000000..6f74fd09b022a6 --- /dev/null +++ b/src/tests/functional/shared_test_classes/src/subgraph/lora_pattern.cpp @@ -0,0 +1,143 @@ +// Copyright (C) 
2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "shared_test_classes/subgraph/lora_pattern.hpp" + +#include "common_test_utils/node_builders/eltwise.hpp" +#include "common_test_utils/node_builders/convolution.hpp" +#include "common_test_utils/ov_tensor_utils.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" + +namespace ov { +namespace test { + + +std::string LoraPatternBase::getTestCaseName(const testing::TestParamInfo& obj) { + auto device_name = obj.param; + return std::string{"targetDevice="} + device_name; //NOLINT +} + +constexpr ov::element::Type LoraPatternBase::netType; //redundant variable definition for C++ prior to C++17 + +void LoraPatternBase::run_test_empty_tensors() { + compile_model(); + inferRequest = compiledModel.create_infer_request(); + ASSERT_TRUE(inferRequest); + generate_inputs(targetStaticShapes.front()); + for (const auto& input : inputs) { + inferRequest.set_tensor(input.first, input.second); + } + + inferRequest.infer(); + auto outputs = function->outputs(); + + auto tx_result = inferRequest.get_tensor(outputs[0]); + auto tz_result = inferRequest.get_tensor(outputs[1]); + ov::test::utils::compare(tx_result, tz_result, 1e-4, 1e-4); +} + +void LoraPatternMatmul::SetUp() { + targetDevice = this->GetParam(); + + ov::PartialShape shape_x = {-1, -1, K}; + ov::PartialShape shape_w = {N, K}; + + auto param_y = std::make_shared(netType, shape_x); + auto param_w = std::make_shared(netType, shape_w); + + // "Main" matrix multiplication from the original transformer model + auto tx = std::make_shared(param_y, param_w, false, true); + + // LoRA parameters from states + auto variable_t4 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({N, -1}), netType, t4_name}); + auto t4 = std::make_shared(variable_t4); + auto t4_assign = std::make_shared(t4, variable_t4); + + auto variable_t5 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), netType, t5_name}); + auto t5 = std::make_shared(variable_t5); + auto t5_assign = std::make_shared(t5, variable_t5); + + auto variable_t6 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, K}), netType, t6_name}); + auto t6 = std::make_shared(variable_t6); + auto t6_assign = std::make_shared(t6, variable_t6); + + // Apply LoRA parameters to the current activations + auto t5810 = std::make_shared(param_y, t6, false, true); + auto t5811 = std::make_shared(t5810, t5); + auto t5812 = std::make_shared(t5811, t4, false, true); + + // Mix LoRA part into normally computed activations after the "main" MatMul + auto tz = std::make_shared(tx, t5812); + + auto result_x = std::make_shared(tx); + auto result_z = std::make_shared(tz); + + function = std::make_shared(ov::ResultVector({result_x, result_z}), + ov::SinkVector({t4_assign, t5_assign, t6_assign}), + ov::ParameterVector({param_y, param_w})); +} + +void LoraPatternConvolution::SetUp() { + targetDevice = this->GetParam(); + + ov::PartialShape shape_x = {-1, num_channels, -1, -1}; + + auto param_y = std::make_shared(netType, shape_x); + + // Original Convolution that is modified by LoRA adapter later + auto tx = ov::test::utils::make_convolution(param_y, + netType, + {1, 1}, + {1, 1}, + {0, 0}, + {0, 0}, + {1, 1}, + ov::op::PadType::EXPLICIT, + num_channels); + + // LoRA parameters from states + auto variable_t4 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({num_channels, -1}), netType, t4_name}); + auto t4 = std::make_shared(variable_t4); + auto t4_assign = std::make_shared(t4, 
variable_t4); + + auto variable_t5 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), netType, t5_name}); + auto t5 = std::make_shared(variable_t5); + auto t5_assign = std::make_shared(t5, variable_t5); + + auto variable_t6 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, num_channels}), netType, t6_name}); + auto t6 = std::make_shared(variable_t6); + auto t6_assign = std::make_shared(t6, variable_t6); + + // LoRA pattern with additional Transposes to move channel dimensions into positions where MatMul can be applied + auto t4940 = + std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{2, 3, 0, 1}); + + auto t4941 = std::make_shared(param_y, t4940); + auto t4942 = std::make_shared(t4941, t6, false, true); + auto t4943 = std::make_shared(t4942, t5); + auto t4944 = std::make_shared(t4943, t4, false, true); + + auto t4945 = + std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{2, 3, 0, 1}); + auto t4946 = std::make_shared(t4944, t4945); + + // Mix LoRA part into normally computed activations after the "main" MatMul + auto tz = std::make_shared(tx, t4946); + + auto result_x = std::make_shared(tx); + auto result_z = std::make_shared(tz); + + function = std::make_shared(ov::ResultVector({result_x, result_z}), + ov::SinkVector({t4_assign, t5_assign, t6_assign}), + ov::ParameterVector({param_y})); +} + +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/shared_test_classes/src/subgraph/rotary_pos_emb.cpp b/src/tests/functional/shared_test_classes/src/subgraph/rotary_pos_emb.cpp index 46ea730ac32a8c..a1848903bb76a2 100644 --- a/src/tests/functional/shared_test_classes/src/subgraph/rotary_pos_emb.cpp +++ b/src/tests/functional/shared_test_classes/src/subgraph/rotary_pos_emb.cpp @@ -1027,5 +1027,129 @@ std::shared_ptr RoPETestGPTJSlice::buildROPE_GPTJ(int num_head, return std::make_shared(model_output, ov::ParameterVector{input, sincos}); } +std::shared_ptr RoPETestChatGLM2DRoPEStridedSlice::buildROPE_ChatGLM(int batch, int head_cnt, int rotary_dims) { + auto input = std::make_shared(ov::element::f32, PartialShape{batch, -1, 4096 + 256 + 256}); + auto cos_sin_cache = std::make_shared(ov::element::f32, PartialShape{32768, 32, 2}); + auto position_ids = std::make_shared(ov::element::i32, PartialShape{-1, -1}); + + auto __module_transformer_index_67_Gather = + makeOP({cos_sin_cache, position_ids, 0}, {{"batch_dims", 0}}); + + auto ListUnpack_321 = makeOP({input, -1, {4096, 256, 256}}); + auto view_Reshape = makeOP({ListUnpack_321->output(0), {0, 0, 32, 128}}, {{"special_zero", true}}); + + auto permute_Transpose = makeOP({view_Reshape, {0, 2, 1, 3}}, {}); + + auto slice_Slice_357 = + makeOP({permute_Transpose, {0, 0, 0, 0}, {0, 0, 0, 64}, {1, 1, 1, 1}}, + {{"begin_mask", {1, 1, 1, 0}}, + {"end_mask", {1, 1, 1, 0}}, + {"new_axis_mask", {}}, + {"shrink_axis_mask", {}}, + {"ellipsis_mask", {}}}); + + auto aten_view_Reshape_1 = makeOP({ListUnpack_321->output(1), {0, 0, 2, 128}}, {{"special_zero", true}}); + auto aten_transpose_1 = makeOP({aten_view_Reshape_1, {0, 2, 1, 3}}); + auto shape_of_105249 = makeOP({aten_transpose_1}, {{"output_type", "i32"}}); + auto gather_105252 = makeOP({shape_of_105249, {2}, {0}}, {{"batch_dims", 0}}); + auto scatter_update_63441 = makeOP({{0, 0}, {1}, gather_105252, {0}}); + // connected to cos_sin_cache + auto slice_Slice_369 = + makeOP({__module_transformer_index_67_Gather, {0, 0}, scatter_update_63441, {1, 1}}, + {{"begin_mask", {1, 0}}, + {"end_mask", {1, 0}}, 
+ {"new_axis_mask", {}}, + {"shrink_axis_mask", {}}, + {"ellipsis_mask", {}}}); + auto list_construct_concat_1 = makeOP({{-1}, {1}, gather_105252, {32}, {2}}, {{"axis", 0}}); + + auto reshape_Reshape_373 = + makeOP({slice_Slice_357, {0, 32, 0, 32, 2}}, {{"special_zero", true}}); + auto select_Gather_384 = makeOP({reshape_Reshape_373, 0, -1}, {{"batch_dims", 0}});//x_even + auto select_Gather_381 = makeOP({reshape_Reshape_373, 1, -1}, {{"batch_dims", 0}});//x_odd + + auto view_Reshape_380 = + makeOP({slice_Slice_369, list_construct_concat_1}, {{"special_zero", false}}); + auto select_Gather_385 = makeOP({view_Reshape_380, 0, -1}, {{"batch_dims", 0}});//cos_tab + auto select_Gather_382 = makeOP({view_Reshape_380, 1, -1}, {{"batch_dims", 0}});//sin_tab + + auto mul_Multiply_386 = + makeOP({select_Gather_381, select_Gather_382}, {{"auto_broadcast", "numpy"}});//x_odd_sin + auto mul_Multiply_383 = + makeOP({select_Gather_384, select_Gather_385}, {{"auto_broadcast", "numpy"}});//x_even_cos + auto sub_Subtract_389 = + makeOP({mul_Multiply_383, mul_Multiply_386}, {{"auto_broadcast", "numpy"}}); + + auto mul_Multiply_391 = + makeOP({select_Gather_381, select_Gather_385}, {{"auto_broadcast", "numpy"}});//x_odd_cos + auto mul_Multiply_393 = + makeOP({select_Gather_384, select_Gather_382}, {{"auto_broadcast", "numpy"}});//x_even_sin + auto add_Add_396 = makeOP({mul_Multiply_391, mul_Multiply_393}, {{"auto_broadcast", "numpy"}}); + + auto Unsqueeze_62716 = makeOP({sub_Subtract_389, -1}, {}); + auto Unsqueeze_62717 = makeOP({add_Add_396, -1}, {}); + + auto stack_401 = makeOP({Unsqueeze_62716, Unsqueeze_62717}, {{"axis", -1}}); + auto flatten_Reshape_421 = makeOP({stack_401, {0, 32, 0, 64}}, {{"special_zero", true}}); + auto slice_Slice_363 = + makeOP({permute_Transpose, {0, 0, 0, 64}, {0, 0, 0, INT_MAX}, {1, 1, 1, 1}}, + {{"begin_mask", {1, 1, 1, 0}}, + {"end_mask", {1, 1, 1, 0}}, + {"new_axis_mask", {}}, + {"shrink_axis_mask", {}}, + {"ellipsis_mask", {}}}); + auto cat_Concat_425 = makeOP({flatten_Reshape_421, slice_Slice_363}, {{"axis", -1}}); + return std::make_shared(ov::NodeVector{cat_Concat_425}, + ov::ParameterVector{input, cos_sin_cache, position_ids}); +} + +ov::Tensor RoPETestChatGLM2DRoPEStridedSlice::create_i32_tensor(const ov::Shape& shape, int start, int step) { + auto tensor = ov::Tensor(ov::element::i32, shape); + auto* ptr = static_cast(tensor.data()); + for (size_t i = 0; i < tensor.get_size(); i++) { + ptr[i] = start; + start += step; + } + return tensor; +} + +void RoPETestChatGLM2DRoPEStridedSlice::generate_inputs(const std::vector& targetInputStaticShapes) { + const auto& funcInputs = function->inputs(); + + auto& input_shape = targetInputStaticShapes[0]; + auto batch = input_shape[0]; + auto seq_length = input_shape[1]; + + ov::Tensor t_input = utils::create_and_fill_tensor(funcInputs[0].get_element_type(), input_shape, 2, -1.0f, 32768); + ov::Tensor t_cos_sin_cache = + utils::create_and_fill_tensor(funcInputs[1].get_element_type(), {32768, 32, 2}, 2, -1.0f, 32768); + ov::Tensor t_position_ids = create_i32_tensor(ov::Shape({batch, seq_length}), 15); + + inputs.clear(); + inputs.insert({funcInputs[0].get_node_shared_ptr(), t_input}); + inputs.insert({funcInputs[1].get_node_shared_ptr(), t_cos_sin_cache}); + inputs.insert({funcInputs[2].get_node_shared_ptr(), t_position_ids}); +} + +void RoPETestChatGLM2DRoPEStridedSlice::SetUp() { + targetDevice = this->GetParam(); + + const int batch = 2; + const int seq_length = 7; + const int num_head = 32; + const int rotary_dims = 64; + + 
InputShape inpShape = {{batch, -1, 4096 + 256 + 256}, {{batch, seq_length, 4096 + 256 + 256}}}; + init_input_shapes({inpShape}); + function = buildROPE_ChatGLM(-1, num_head, rotary_dims); +} + +std::string RoPETestChatGLM2DRoPEStridedSlice::getTestCaseName(const testing::TestParamInfo& obj) { + std::string targetDevice = obj.param; + std::ostringstream result; + result << "targetDevice=" << targetDevice; + return result.str(); +} + } // namespace test } // namespace ov diff --git a/tests/constraints.txt b/tests/constraints.txt index f09da0d3b409e9..616aea79c82153 100644 --- a/tests/constraints.txt +++ b/tests/constraints.txt @@ -1,4 +1,4 @@ -numpy>=1.16.6,<1.27 +numpy>=1.16.6,<2.1.0 attrs==23.2.0 distro==1.9.0 h5py>=3.1.0,<3.12.0 @@ -6,7 +6,6 @@ Jinja2>=2.11.2 pandas>=1.3.5 pymongo>=3.12.0 PyYAML>=5.4.1 -scipy>=1.7; python_version <= "3.8" scipy>=1.11.1; python_version >= "3.9" sympy>=1.10 wheel>=0.38.1 diff --git a/tests/layer_tests/requirements.txt b/tests/layer_tests/requirements.txt index 6799b32036df97..cb8e71f0c7fe7f 100644 --- a/tests/layer_tests/requirements.txt +++ b/tests/layer_tests/requirements.txt @@ -4,9 +4,9 @@ numpy onnxruntime requests torch -torchvision; platform_machine == 'arm64' and python_version >= '3.8' +torchvision; platform_machine == 'arm64' and python_version >= '3.9' torchvision; platform_machine != 'arm64' -sympy; platform_machine == 'arm64' and python_version >= '3.8' +sympy; platform_machine == 'arm64' and python_version >= '3.9' sympy; platform_machine != 'arm64' transformers packaging diff --git a/thirdparty/open_model_zoo b/thirdparty/open_model_zoo index f798fd62d66c27..e7df86da686d2e 160000 --- a/thirdparty/open_model_zoo +++ b/thirdparty/open_model_zoo @@ -1 +1 @@ -Subproject commit f798fd62d66c273c757ab9c6038a47a364b726d0 +Subproject commit e7df86da686d2e1600282422e54f66c2fecea160 diff --git a/tools/ovc/openvino/tools/ovc/convert.py b/tools/ovc/openvino/tools/ovc/convert.py index 782fa25ab2dd8b..77693ad4be2ca1 100644 --- a/tools/ovc/openvino/tools/ovc/convert.py +++ b/tools/ovc/openvino/tools/ovc/convert.py @@ -27,7 +27,7 @@ def convert_model( Framework-agnostic parameters: :param input_model: - Model object in original framework (PyTorch, Tensorflow) or path to model file. + Model object in original framework (PyTorch, TensorFlow) or path to model file. 
Supported formats of input model: @@ -35,6 +35,7 @@ def convert_model( torch.nn.Module torch.jit.ScriptModule torch.jit.ScriptFunction + torch.export.ExportedProgram TF tf.compat.v1.Graph diff --git a/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py b/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py index 0119a541494cb9..d3b77c9a61f566 100644 --- a/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py +++ b/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py @@ -89,37 +89,38 @@ def get_pytorch_decoder_for_model_on_disk(argv, args): else: input_model = argv.input_model - if isinstance(input_model, (str, pathlib.Path)): - # attempt to load scripted model - try: - inputs = prepare_torch_inputs(example_inputs) - model = torch.jit.load(input_model) - model.eval() - decoder = TorchScriptPythonDecoder( - model, - example_input=inputs, - shared_memory=args.get("share_weights", True), - module_extensions=extract_module_extensions(args)) + if not isinstance(input_model, (str, pathlib.Path)): + return False + + # attempt to load scripted model + try: + inputs = prepare_torch_inputs(example_inputs) + model = torch.jit.load(input_model) + model.eval() + decoder = TorchScriptPythonDecoder( + model, + example_input=inputs, + shared_memory=args.get("share_weights", True), + module_extensions=extract_module_extensions(args)) + argv.input_model = decoder + argv.framework = 'pytorch' + return True + except: + pass + # attempt to load exported model + try: + exported_program = torch.export.load(input_model) + if hasattr(torch, "export") and isinstance(exported_program, (torch.export.ExportedProgram)): + from packaging import version + if version.parse(torch.__version__) >= version.parse("2.2"): + exported_program = exported_program.run_decompositions() + gm = exported_program.module() + decoder = TorchFXPythonDecoder(gm, dynamic_shapes=True) argv.input_model = decoder argv.framework = 'pytorch' return True - except: - pass - if isinstance(input_model, (str, pathlib.Path)): - # attempt to load exported model - try: - exported_program = torch.export.load(input_model) - if hasattr(torch, "export") and isinstance(exported_program, (torch.export.ExportedProgram)): - from packaging import version - if version.parse(torch.__version__) >= version.parse("2.2"): - exported_program = exported_program.run_decompositions() - gm = exported_program.module() - decoder = TorchFXPythonDecoder(gm, dynamic_shapes=True) - argv.input_model = decoder - argv.framework = 'pytorch' - return True - except: - pass + except: + pass return False
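
A usage sketch for the torch.export loading path refactored above: assuming this change is in place and torch >= 2.2 (the version gate used before run_decompositions()), a serialized ExportedProgram file can be passed to convert_model() directly. The model class and file names below are illustrative only.

import torch
import openvino as ov

class TinyNet(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.relu(x) + 1.0

# Export a PyTorch model to an ExportedProgram and serialize it to disk.
example_input = (torch.randn(1, 3, 224, 224),)
exported = torch.export.export(TinyNet().eval(), example_input)
torch.export.save(exported, "tiny_net.pt2")  # hypothetical path

# get_pytorch_decoder_for_model_on_disk() first tries torch.jit.load() and then
# falls back to torch.export.load(), so the saved program converts directly.
ov_model = ov.convert_model("tiny_net.pt2")
ov.save_model(ov_model, "tiny_net.xml")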
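
For the new REG online-partitioning pipeline added to the NPUW compiler earlier in this diff, a hedged configuration sketch: the property strings below (NPU_USE_NPUW, NPUW_ONLINE_PIPELINE) are assumed to match the option keys referenced in the C++ code, and the model path is illustrative.

import openvino as ov

core = ov.Core()
npuw_config = {
    "NPU_USE_NPUW": "YES",           # assumed key that enables the NPUW path
    "NPUW_ONLINE_PIPELINE": "REG",   # new value handled by currentPipeline()
    # Per warn_unused() above, NPUW_ONLINE_ISOLATE is ignored (with a warning)
    # under REG and COMPUTE, and NPUW_ONLINE_NO_FOLD is ignored under COMPUTE.
}
compiled = core.compile_model("model.xml", "NPU", npuw_config)  # hypothetical model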
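
The LoraPatternMatmul subgraph added above computes z = x @ W^T + ((x @ A^T) * alpha) @ B^T, where A ("lora/MatMul.A"), alpha ("lora/MatMul.alpha") and B ("lora/MatMul.B") come from ReadValue/Assign states; run_test_empty_tensors() checks that with the default zero-sized states the LoRA branch is a no-op, so both model outputs coincide. A minimal numpy sketch of that identity (K, N and the token count follow the test constants, the rest is illustrative):

import numpy as np

K, N, tokens, rank = 563, 2048, 20, 0            # rank 0 models the empty-state case
x = np.random.rand(1, tokens, K).astype(np.float32)
W = np.random.rand(N, K).astype(np.float32)
A = np.zeros((rank, K), dtype=np.float32)        # lora/MatMul.A
alpha = np.zeros((1, rank), dtype=np.float32)    # lora/MatMul.alpha
B = np.zeros((N, rank), dtype=np.float32)        # lora/MatMul.B

tx = x @ W.T                                     # the "main" MatMul
tz = tx + ((x @ A.T) * alpha) @ B.T              # main result plus the LoRA correction
assert np.allclose(tx, tz)                       # an empty adapter contributes nothing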
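
The RoPETestChatGLM2DRoPEStridedSlice graph built above rotates only the first rotary_dims = 64 channels of each 128-wide head and concatenates the untouched remainder back (slice_Slice_363 into cat_Concat_425). In the notation of the x_even/x_odd/cos_tab/sin_tab comments in buildROPE_ChatGLM, each channel pair is rotated as y_even = x_even * cos(theta) - x_odd * sin(theta) and y_odd = x_odd * cos(theta) + x_even * sin(theta), with cos(theta) and sin(theta) gathered from the cos_sin_cache input at the given position_ids.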